author     huifang <huifangzhang@openailab>    2018-01-31 19:00:24 +0800
committer  huifang <huifangzhang@openailab>    2018-01-31 19:00:24 +0800
commit     1224a143fc631f9f004881dba7a32c6f7ed5e1a5 (patch)
tree       2f41c58f0d40c9c76b4df1762c09597397629648
parent     7e51f0ff47b9a74f949f5b9ca448d56b6ba3eba5 (diff)
download   caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.tar.gz
           caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.tar.bz2
           caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.zip
v0.5.0
-rw-r--r--  Makefile | 6
-rw-r--r--  Makefile.config.acl | 15
-rw-r--r--  README.md | 8
-rw-r--r--  acl_openailab/accuracy_report.pdf | bin 0 -> 949615 bytes
-rw-r--r--  acl_openailab/installation.md | 9
-rw-r--r--  acl_openailab/performance_report.pdf | bin 769748 -> 1433625 bytes
-rw-r--r--  acl_openailab/user_manual.pdf | bin 948890 -> 992120 bytes
-rw-r--r--  data/ilsvrc12/get_ilsvrc_aux.sh | 2
-rw-r--r--  examples/cpp_classification/classification_profiling_schedule.cpp | 547
-rw-r--r--  include/caffe/acl_layer.hpp | 278
-rw-r--r--  include/caffe/acl_operator.hpp | 718
-rw-r--r--  include/caffe/acl_tensor.hpp | 114
-rw-r--r--  include/caffe/layer.hpp | 1
-rw-r--r--  include/caffe/layers/acl_absval_layer.hpp | 7
-rw-r--r--  include/caffe/layers/acl_base_activation_layer.hpp | 10
-rw-r--r--  include/caffe/layers/acl_base_conv_layer.hpp | 61
-rw-r--r--  include/caffe/layers/acl_batch_norm_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_bnll_layer.hpp | 7
-rw-r--r--  include/caffe/layers/acl_concat_layer.hpp | 12
-rw-r--r--  include/caffe/layers/acl_conv_layer.hpp | 72
-rw-r--r--  include/caffe/layers/acl_inner_product_layer.hpp | 10
-rw-r--r--  include/caffe/layers/acl_local_connect_layer.hpp | 10
-rw-r--r--  include/caffe/layers/acl_lrn_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_pooling_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_relu_layer.hpp | 5
-rw-r--r--  include/caffe/layers/acl_sigmoid_layer.hpp | 7
-rw-r--r--  include/caffe/layers/acl_softmax_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_tanh_layer.hpp | 7
-rw-r--r--  src/caffe/acl_layer.cpp | 289
-rw-r--r--  src/caffe/acl_operator.cpp | 227
-rw-r--r--  src/caffe/acl_tensor.cpp | 138
-rw-r--r--  src/caffe/common.cpp | 16
-rw-r--r--  src/caffe/layer_factory.cpp | 2
-rw-r--r--  src/caffe/layers/acl_absval_layer.cpp | 19
-rw-r--r--  src/caffe/layers/acl_base_activation_layer.cpp | 71
-rw-r--r--  src/caffe/layers/acl_base_conv_layer.cpp | 222
-rw-r--r--  src/caffe/layers/acl_batch_norm_layer.cpp | 140
-rw-r--r--  src/caffe/layers/acl_bnll_layer.cpp | 19
-rw-r--r--  src/caffe/layers/acl_concat_layer.cpp | 97
-rw-r--r--  src/caffe/layers/acl_conv_layer.cpp | 147
-rw-r--r--  src/caffe/layers/acl_inner_product_layer.cpp | 111
-rw-r--r--  src/caffe/layers/acl_local_connect_layer.cpp | 149
-rw-r--r--  src/caffe/layers/acl_lrn_layer.cpp | 84
-rw-r--r--  src/caffe/layers/acl_pooling_layer.cpp | 115
-rw-r--r--  src/caffe/layers/acl_relu_layer.cpp | 35
-rw-r--r--  src/caffe/layers/acl_sigmoid_layer.cpp | 19
-rw-r--r--  src/caffe/layers/acl_softmax_layer.cpp | 64
-rw-r--r--  src/caffe/layers/acl_tanh_layer.cpp | 18
-rw-r--r--  tools/extra/tpi.py | 213
49 files changed, 2591 insertions, 1546 deletions
diff --git a/Makefile b/Makefile
index 2afeae34..8f0c65fa 100644
--- a/Makefile
+++ b/Makefile
@@ -329,7 +329,7 @@ ifeq ($(DEBUG), 1)
COMMON_FLAGS += -DDEBUG -g -O0
NVCCFLAGS += -G
else
- COMMON_FLAGS += -DNDEBUG -O2
+ COMMON_FLAGS += -DNDEBUG -O3
endif
# cuDNN acceleration configuration.
@@ -352,6 +352,10 @@ ifeq ($(USE_ACL), 1)
COMMON_FLAGS += -DUSE_ACL -std=c++11
endif
+ifeq ($(USE_OPENCL), 1)
+ COMMON_FLAGS += -DUSE_OPENCL
+endif
+
#USE_PROFILING -- get profiling information, controlled by LOGACL
#LAYER_PERF_STAT -- haitao's net profiling information
ifeq ($(USE_PROFILING), 1)
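The hunks above only flip build switches; the rest of the commit keys off the resulting preprocessor defines (USE_ACL, USE_OPENCL, USE_PROFILING, NDEBUG). A minimal standalone sketch, illustrative and not part of this commit, that reports which of those defines a translation unit was built with:

    #include <cstdio>

    // Illustrative only: shows how the Makefile switches above reach the sources
    // as preprocessor defines (-DUSE_ACL, -DUSE_OPENCL, -DUSE_PROFILING, -DNDEBUG).
    int main() {
    #ifdef USE_ACL
      std::puts("USE_ACL: ACL-backed layers compiled in (built with -std=c++11)");
    #endif
    #ifdef USE_OPENCL
      std::puts("USE_OPENCL: OpenCL/GPU code paths available");
    #endif
    #ifdef USE_PROFILING
      std::puts("USE_PROFILING: per-layer timing compiled in (see LOGACL / LAYER_PERF_STAT)");
    #endif
    #ifdef NDEBUG
      std::puts("Release build: -DNDEBUG -O3");
    #endif
      return 0;
    }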
diff --git a/Makefile.config.acl b/Makefile.config.acl
index f20326f9..8bbce0a2 100644
--- a/Makefile.config.acl
+++ b/Makefile.config.acl
@@ -1,18 +1,19 @@
## Refer to http://caffe.berkeleyvision.org/installation.html
# Contributions simplifying and improving our build system are welcome!
-AIDDIR=/usr/local/AID
-export PKG_CONFIG_PATH=${AIDDIR}/opencv3.3.0/lib/pkgconfig
-
# cuDNN acceleration switch (uncomment to build with cuDNN).
# USE_CUDNN := 1
+AIDDIR=/usr/local/AID
+export PKG_CONFIG_PATH=${AIDDIR}/opencv3.3.0/lib/pkgconfig
+
# CPU-only switch (uncomment to build without GPU support).
CPU_ONLY := 1
USE_PROFILING := 0
USE_ACL :=1
+USE_OPENCL:=1
ACL_ROOT :=$(AIDDIR)/ComputeLibrary
ACL_INCS :=$(ACL_ROOT)/include
ACL_INCS +=$(ACL_ROOT)
@@ -143,3 +144,11 @@ TEST_GPUID := 0
# enable pretty build (comment to see full commands)
Q ?= @
+
+ifeq ($(wildcard $(AIDDIR)),)
+ACL_ROOT :=$(shell pwd)/../ComputeLibrary
+ACL_INCS :=$(ACL_ROOT)/include
+ACL_INCS +=$(ACL_ROOT)
+ACL_LIBS_DIR :=$(ACL_ROOT)/build
+OPENCV_VERSION :=
+endif
diff --git a/README.md b/README.md
index c4c1deb0..cd1ffe27 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-
# CaffeOnACL
[![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
@@ -11,9 +10,10 @@ The release version is 0.4.0, is based on [Rockchip RK3399](http://www.rock-chip
* Caffe is a fast open framework for deep learning. See also [Caffe](https://github.com/BVLC/caffe).
### Documents
-* [Installation instructions](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/installation.md)
-* [User Manuals PDF](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/user_manual.pdf)
-* [Performance Report PDF](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/performance_report.pdf)
+* [Installation instructions](acl_openailab/installation.md)
+* [User Manuals PDF](acl_openailab/user_manual.pdf)
+* [Performance Report PDF](acl_openailab/performance_report.pdf)
+* [Accuracy Report PDF](acl_openailab/accuracy_report.pdf)
### Arm Compute Library Compatibility Issues :
There are some compatibility issues between ACL and Caffe layers; we bypass them to Caffe's original layer classes as a workaround for the issues below
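The bypass mentioned here is driven by a runtime bitmask: the new acl_operator.hpp later in this commit declares extern unsigned int bypass_acl_class_layer plus one FLAGS_ENABLE_ACL_* bit per layer type. A minimal sketch of that pattern follows; the bit values are copied from the header, while treating a set bit as "fall back to the original Caffe layer" and populating the mask from an environment variable are assumptions:

    #include <cstdio>

    // Bit values copied from include/caffe/acl_operator.hpp in this commit; how the mask
    // is populated at startup (e.g. from an environment variable) is an assumption here.
    static unsigned int bypass_acl_class_layer = 0;
    #define FLAGS_ENABLE_ACL_CONV   0x00000004
    #define FLAGS_ENABLE_ACL_CONCAT 0x00001000

    static bool use_acl(unsigned int flag) {
      // Assumed convention: a set bit routes that layer type back to the plain Caffe class.
      return (bypass_acl_class_layer & flag) == 0;
    }

    int main() {
      bypass_acl_class_layer |= FLAGS_ENABLE_ACL_CONCAT;  // e.g. work around an ACL concat issue
      std::printf("concat via ACL: %s\n", use_acl(FLAGS_ENABLE_ACL_CONCAT) ? "yes" : "no");
      std::printf("conv   via ACL: %s\n", use_acl(FLAGS_ENABLE_ACL_CONV) ? "yes" : "no");
      return 0;
    }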
diff --git a/acl_openailab/accuracy_report.pdf b/acl_openailab/accuracy_report.pdf
new file mode 100644
index 00000000..75d9c08f
--- /dev/null
+++ b/acl_openailab/accuracy_report.pdf
Binary files differ
diff --git a/acl_openailab/installation.md b/acl_openailab/installation.md
index 75bb62bb..f97c1040 100644
--- a/acl_openailab/installation.md
+++ b/acl_openailab/installation.md
@@ -21,10 +21,11 @@ This Installation will help you get started to setup CaffeOnACL on RK3399 quickl
wget --no-check-certificate https://github.com/opencv/opencv/archive/3.3.0.tar.gz
tar -xvf 3.3.0.tar.gz
#### Download "gen-pkg-config-pc"
- wget https://github.com/OAID/AID-tools/raw/master/script/gen-pkg-config-pc.sh
+ wget ftp://ftp.openailab.net/tools/script/gen-pkg-config-pc.sh
+ chmod +x ./gen-pkg-config-pc.sh
#### Download "ACL"
git clone https://github.com/ARM-software/ComputeLibrary.git
- git checkout bf8b01d
+ git checkout 48bc34e
#### Download "CaffeOnACL" :
git clone https://github.com/OAID/CaffeOnACL.git
#### Download "Googletest" :
@@ -45,14 +46,16 @@ This Installation will help you get started to setup CaffeOnACL on RK3399 quickl
mkdir build
aarch64-linux-gnu-gcc opencl-1.2-stubs/opencl_stubs.c -Iinclude -shared -o build/libOpenCL.so
scons Werror=1 -j4 debug=0 asserts=1 neon=1 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a
- wget https://github.com/OAID/AID-tools/raw/master/script/Computelibrary/Makefile
+ wget ftp://ftp.openailab.net/tools/script/Computelibrary/Makefile
sudo make install
+ sudo ~/gen-pkg-config-pc.sh /usr/local/AID
## 3.3 Build Caffe :
cd ~/CaffeOnACL
make all
make distribute
sudo make install
+ sudo ~/gen-pkg-config-pc.sh /usr/local/AID
## 3.4 Build Unit tests
##### Build the gtest libraries
diff --git a/acl_openailab/performance_report.pdf b/acl_openailab/performance_report.pdf
index 7bffc1d8..4cc1b531 100644
--- a/acl_openailab/performance_report.pdf
+++ b/acl_openailab/performance_report.pdf
Binary files differ
diff --git a/acl_openailab/user_manual.pdf b/acl_openailab/user_manual.pdf
index 25530f04..aadfe38e 100644
--- a/acl_openailab/user_manual.pdf
+++ b/acl_openailab/user_manual.pdf
Binary files differ
diff --git a/data/ilsvrc12/get_ilsvrc_aux.sh b/data/ilsvrc12/get_ilsvrc_aux.sh
index dc0d0a72..835212bd 100644
--- a/data/ilsvrc12/get_ilsvrc_aux.sh
+++ b/data/ilsvrc12/get_ilsvrc_aux.sh
@@ -18,4 +18,4 @@ echo "Unzipping..."
tar -xf caffe_ilsvrc12.tar.gz && rm -f caffe_ilsvrc12.tar.gz
-echo "Done."
+echo "Done. " \ No newline at end of file
diff --git a/examples/cpp_classification/classification_profiling_schedule.cpp b/examples/cpp_classification/classification_profiling_schedule.cpp
new file mode 100644
index 00000000..91fff5f7
--- /dev/null
+++ b/examples/cpp_classification/classification_profiling_schedule.cpp
@@ -0,0 +1,547 @@
+#include <caffe/caffe.hpp>
+#ifdef USE_OPENCV
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif // USE_OPENCV
+#include <algorithm>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifdef USE_PROFILING
+
+#include <iostream>
+
+#include <time.h>
+
+#define REPEAT_TEST
+
+unsigned long get_cur_time(void)
+{
+ struct timespec tm;
+
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &tm);
+
+ return (tm.tv_sec*1000+tm.tv_nsec/1000000);
+}
+
+#endif //USE_PROFILING
+
+#ifdef USE_OPENCV
+using namespace caffe; // NOLINT(build/namespaces)
+using std::string;
+
+/* Pair (label, confidence) representing a prediction. */
+typedef std::pair<string, float> Prediction;
+
+class Classifier {
+ public:
+ Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file);
+
+ std::vector<Prediction> Classify(const cv::Mat& img, int N = 5);
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+ void dump_perf_stat(void);
+ void dump_single_layer_io(int idx, Layer<float> * p_layer);
+ void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time);
+#ifdef REPEAT_TEST
+ void collect_layer_stat(vector<vector<perf_stat> * > & all_stat);
+ void dump_all_stat(vector <vector<perf_stat>*>& all_stat);
+ void reset_layer_stat();
+#endif
+#endif
+
+#endif //USE_PROFILING
+
+ private:
+ void SetMean(const string& mean_file);
+
+ std::vector<float> Predict(const cv::Mat& img);
+
+ void WrapInputLayer(std::vector<cv::Mat>* input_channels);
+
+ void Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels);
+
+ private:
+ shared_ptr<Net<float> > net_;
+ cv::Size input_geometry_;
+ int num_channels_;
+ cv::Mat mean_;
+ std::vector<string> labels_;
+};
+
+Classifier::Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file) {
+#ifdef CPU_ONLY
+ Caffe::set_mode(Caffe::CPU);
+#else
+ Caffe::set_mode(Caffe::GPU);
+#endif
+
+ AclEnableSchedule();
+ /* Load the network. */
+ net_.reset(new Net<float>(model_file, TEST));
+ net_->CopyTrainedLayersFrom(trained_file);
+
+ CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
+ CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";
+
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ num_channels_ = input_layer->channels();
+ CHECK(num_channels_ == 3 || num_channels_ == 1)
+ << "Input layer should have 1 or 3 channels.";
+ input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
+
+ /* Load the binaryproto mean file. */
+ SetMean(mean_file);
+
+ /* Load labels. */
+ std::ifstream labels(label_file.c_str());
+ CHECK(labels) << "Unable to open labels file " << label_file;
+ string line;
+ while (std::getline(labels, line))
+ labels_.push_back(string(line));
+
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ CHECK_EQ(labels_.size(), output_layer->channels())
+ << "Number of labels is different from the output layer dimension.";
+}
+
+static bool PairCompare(const std::pair<float, int>& lhs,
+ const std::pair<float, int>& rhs) {
+ return lhs.first > rhs.first;
+}
+
+/* Return the indices of the top N values of vector v. */
+static std::vector<int> Argmax(const std::vector<float>& v, int N) {
+ std::vector<std::pair<float, int> > pairs;
+ for (size_t i = 0; i < v.size(); ++i)
+ pairs.push_back(std::make_pair(v[i], i));
+ std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);
+
+ std::vector<int> result;
+ for (int i = 0; i < N; ++i)
+ result.push_back(pairs[i].second);
+ return result;
+}
+
+/* Return the top N predictions. */
+std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) {
+ std::vector<float> output = Predict(img);
+
+ N = std::min<int>(labels_.size(), N);
+ std::vector<int> maxN = Argmax(output, N);
+ std::vector<Prediction> predictions;
+ for (int i = 0; i < N; ++i) {
+ int idx = maxN[i];
+ predictions.push_back(std::make_pair(labels_[idx], output[idx]));
+ }
+
+ return predictions;
+}
+
+/* Load the mean file in binaryproto format. */
+void Classifier::SetMean(const string& mean_file) {
+ BlobProto blob_proto;
+ ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+
+ /* Convert from BlobProto to Blob<float> */
+ Blob<float> mean_blob;
+ mean_blob.FromProto(blob_proto);
+ CHECK_EQ(mean_blob.channels(), num_channels_)
+ << "Number of channels of mean file doesn't match input layer.";
+
+ /* The format of the mean file is planar 32-bit float BGR or grayscale. */
+ std::vector<cv::Mat> channels;
+ float* data = mean_blob.mutable_cpu_data();
+ for (int i = 0; i < num_channels_; ++i) {
+ /* Extract an individual channel. */
+ cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data);
+ channels.push_back(channel);
+ data += mean_blob.height() * mean_blob.width();
+ }
+
+ /* Merge the separate channels into a single image. */
+ cv::Mat mean;
+ cv::merge(channels, mean);
+
+ /* Compute the global mean pixel value and create a mean image
+ * filled with this value. */
+ cv::Scalar channel_mean = cv::mean(mean);
+ mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean);
+}
+
+std::vector<float> Classifier::Predict(const cv::Mat& img) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ input_layer->Reshape(1, num_channels_,
+ input_geometry_.height, input_geometry_.width);
+ /* Forward dimension change to all layers. */
+ net_->Reshape();
+
+ std::vector<cv::Mat> input_channels;
+ WrapInputLayer(&input_channels);
+
+ Preprocess(img, &input_channels);
+
+#ifdef USE_PROFILING
+ unsigned long tstart=get_cur_time();
+#endif //USE_PROFILING
+
+ net_->Forward();
+
+#ifdef USE_PROFILING
+
+ unsigned long tend=get_cur_time();
+
+ std::cout<<"used time: "<<tend-tstart<<std::endl;
+
+#ifdef LAYER_PERF_STAT
+ dump_perf_stat();
+#ifdef REPEAT_TEST
+
+ reset_layer_stat();
+
+ vector<vector<perf_stat>* > all_stat;
+ int rep_number=10;
+
+ for(int i=0;i<rep_number;i++)
+ {
+ net_->Forward();
+ collect_layer_stat(all_stat);
+ reset_layer_stat();
+ }
+
+ //dump stats
+ dump_all_stat(all_stat);
+
+ for(int i=0;i<all_stat.size();i++)
+ delete all_stat[i];
+
+#endif //REPEAT_TEST
+#endif //LAYER_PERF_STAT
+#endif //USE_PROFILING
+
+ /* Copy the output layer to a std::vector */
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ const float* begin = output_layer->cpu_data();
+ const float* end = begin + output_layer->channels();
+ return std::vector<float>(begin, end);
+}
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+
+#ifdef REPEAT_TEST
+void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat)
+{
+ vector<perf_stat > * p_stat;
+ perf_stat * p_time_stat;
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+
+ p_stat=new vector<perf_stat>;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+ p_stat->push_back(*p_time_stat);
+
+ }
+
+ all_stat.push_back(p_stat);
+}
+
+void Classifier::reset_layer_stat(void)
+{
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+ perf_stat * p_time_stat;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+
+ p_time_stat->count=0;
+ p_time_stat->total=0;
+ p_time_stat->used=p_time_stat->start=p_time_stat->end=0;
+ }
+}
+
+void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat)
+{
+
+ struct new_perf_stat {
+ perf_stat stat;
+ int idx;
+ };
+
+ vector<new_perf_stat > layer_stat;
+ perf_stat * p_stat;
+
+ uint64_t total_time=0;
+
+ layer_stat.resize(all_stat[0]->size());
+
+ for(int i=0;i<all_stat.size();i++)
+ {
+ for(int j=0;j<layer_stat.size();j++)
+ {
+ p_stat=&layer_stat[j].stat;
+
+ p_stat->total+=(*all_stat[i])[j].total;
+ p_stat->count+=(*all_stat[i])[j].count;
+ total_time+=(*all_stat[i])[j].total;
+ }
+ }
+
+ total_time=total_time/all_stat.size();
+
+ std::cout<<std::endl<<"----------------------------------"<<std::endl;
+ std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl;
+ std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl;
+ std::cout<<"Each layer stats: ..."<<std::endl;
+
+
+ for(int i=layer_stat.size()-1;i>=0;i--)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ layer_stat[i].idx=i;
+
+ std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ }
+
+ std::cout<<std::endl;
+
+ std::cout<<"time cost top 10 layers are: ..."<<std::endl;
+
+ std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b)
+ {
+ if(a.stat.total>b.stat.total)
+ return true;
+ else
+ return false;
+ });
+
+ uint64_t top_total_time=0;
+
+ for(int i=0; i<10; i++)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ top_total_time+=p_stat->total;
+ }
+
+ std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl;
+
+ std::cout<<std::endl;
+}
+
+#endif
+
+void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type()<<std::endl;
+
+ const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom;
+
+ for(int i=0;i<layer_param.bottom_size(); i++)
+ {
+ std::cout<<"bottom "<<layer_param.bottom(i)<<": ";
+
+ Blob<float> * p_blob=(*p_bottom_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+
+ const vector<Blob<float>*> *p_top_vec=p_layer->saved_top;
+ for(int i=0;i<layer_param.top_size(); i++)
+ {
+ std::cout<<"top "<<layer_param.top(i)<<": ";
+ Blob<float> * p_blob=(*p_top_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+}
+
+void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+ perf_stat * p_time_stat;
+
+ p_time_stat=p_layer->get_time_stat();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type();
+ std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl;
+
+
+ std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count;
+ if(p_time_stat->count)
+ {
+ std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count;
+ }
+
+ std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end;
+ std::cout<<std::endl;
+
+
+}
+
+void Classifier::dump_perf_stat(void)
+{
+ uint64_t total_net_time=0;
+
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+ std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+ dump_single_layer_io(i,layers[i].get());
+ }
+
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ perf_stat * p_time_stat;
+
+ p_time_stat=layers[i]->get_time_stat();
+
+ total_net_time+=p_time_stat->total;
+
+ }
+
+ std::cout<<"Time for each layer ... sum of all layers is : ";
+ std::cout<<total_net_time<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ dump_single_layer_perf(i,layers[i].get(),total_net_time);
+ }
+
+}
+
+#endif
+
+#endif //USE_PROFILING
+
+/* Wrap the input layer of the network in separate cv::Mat objects
+ * (one per channel). This way we save one memcpy operation and we
+ * don't need to rely on cudaMemcpy2D. The last preprocessing
+ * operation will write the separate channels directly to the input
+ * layer. */
+void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+
+ int width = input_layer->width();
+ int height = input_layer->height();
+ float* input_data = input_layer->mutable_cpu_data();
+ for (int i = 0; i < input_layer->channels(); ++i) {
+ cv::Mat channel(height, width, CV_32FC1, input_data);
+ input_channels->push_back(channel);
+ input_data += width * height;
+ }
+}
+
+void Classifier::Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels) {
+ /* Convert the input image to the input image format of the network. */
+ cv::Mat sample;
+ if (img.channels() == 3 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
+ else if (img.channels() == 1 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
+ else
+ sample = img;
+
+ cv::Mat sample_resized;
+ if (sample.size() != input_geometry_)
+ cv::resize(sample, sample_resized, input_geometry_);
+ else
+ sample_resized = sample;
+
+ cv::Mat sample_float;
+ if (num_channels_ == 3)
+ sample_resized.convertTo(sample_float, CV_32FC3);
+ else
+ sample_resized.convertTo(sample_float, CV_32FC1);
+
+ cv::Mat sample_normalized;
+ cv::subtract(sample_float, mean_, sample_normalized);
+
+ /* This operation will write the separate BGR planes directly to the
+ * input layer of the network because it is wrapped by the cv::Mat
+ * objects in input_channels. */
+ cv::split(sample_normalized, *input_channels);
+
+ CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
+ == net_->input_blobs()[0]->cpu_data())
+ << "Input channels are not wrapping the input layer of the network.";
+}
+
+int main(int argc, char** argv) {
+ if (argc != 6) {
+ std::cerr << "Usage: " << argv[0]
+ << " deploy.prototxt network.caffemodel"
+ << " mean.binaryproto labels.txt img.jpg" << std::endl;
+ return 1;
+ }
+
+ ::google::InitGoogleLogging(argv[0]);
+
+ string model_file = argv[1];
+ string trained_file = argv[2];
+ string mean_file = argv[3];
+ string label_file = argv[4];
+ Classifier classifier(model_file, trained_file, mean_file, label_file);
+
+ string file = argv[5];
+
+ std::cout << "---------- Prediction for "
+ << file << " ----------" << std::endl;
+
+ cv::Mat img = cv::imread(file, -1);
+ CHECK(!img.empty()) << "Unable to decode image " << file;
+ std::vector<Prediction> predictions = classifier.Classify(img);
+
+ /* Print the top N predictions. */
+ for (size_t i = 0; i < predictions.size(); ++i) {
+ Prediction p = predictions[i];
+ std::cout << std::fixed << std::setprecision(4) << p.second << " - \""
+ << p.first << "\"" << std::endl;
+ }
+}
+#else
+int main(int argc, char** argv) {
+ LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
+}
+#endif // USE_OPENCV
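With LAYER_PERF_STAT enabled, the example above repeats the forward pass rep_number (10) times, snapshots every layer's perf_stat after each run, and prints per-layer averages plus the ten most expensive layers. A condensed sketch of that aggregation, assuming a perf_stat layout of {start, end, used, total, count} as implied by reset_layer_stat() above:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Assumed field layout, matching the members touched in reset_layer_stat().
    struct perf_stat { uint64_t start, end, used, total, count; };

    int main() {
      // One snapshot of a single layer per repeated forward pass (times are illustrative).
      std::vector<perf_stat> reps = {{0, 0, 0, 1200, 1}, {0, 0, 0, 1100, 1}, {0, 0, 0, 1300, 1}};

      uint64_t total = 0, count = 0;
      for (const perf_stat& s : reps) { total += s.total; count += s.count; }

      // Same arithmetic as dump_all_stat(): averages are taken over the number of repetitions.
      std::printf("avg time per forward: %llu\n", (unsigned long long)(total / reps.size()));
      std::printf("avg enter count:      %llu\n", (unsigned long long)(count / reps.size()));
      return 0;
    }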
diff --git a/include/caffe/acl_layer.hpp b/include/caffe/acl_layer.hpp
deleted file mode 100644
index b188bb8c..00000000
--- a/include/caffe/acl_layer.hpp
+++ /dev/null
@@ -1,278 +0,0 @@
-#ifndef CAFFE_ACL_LAYER_HPP_
-#define CAFFE_ACL_LAYER_HPP_
-
-#ifdef USE_ACL
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
-#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
-#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
-#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
-#define FLAGS_ENABLE_ACL_BNLL 0x00000002
-#define FLAGS_ENABLE_ACL_CONV 0x00000004
-#define FLAGS_ENABLE_ACL_FC 0x00000008
-#define FLAGS_ENABLE_ACL_LRN 0x00000010
-#define FLAGS_ENABLE_ACL_POOLING 0x00000020
-#define FLAGS_ENABLE_ACL_RELU 0x00000040
-#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
-#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
-#define FLAGS_ENABLE_ACL_TANH 0x00000200
-#define FLAGS_ENABLE_ACL_LC 0x00000400
-#define FLAGS_ENABLE_ACL_BN 0x00000800
-#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
-extern unsigned int bypass_acl_class_layer;
-#endif
-#ifdef USE_PROFILING
-#include "layer.hpp"
-
-#define MASK_LOG_APP_TIME 0x00000001
-#define MASK_LOG_ALLOCATE 0x00000002
-#define MASK_LOG_RUN 0x00000004
-#define MASK_LOG_CONFIG 0x00000008
-#define MASK_LOG_COPY 0x00000010
-#define MASK_LOG_ABSVAL 0x00000020
-#define MASK_LOG_BNLL 0x00000040
-#define MASK_LOG_CONV 0x00000080
-#define MASK_LOG_FC 0x00000100
-#define MASK_LOG_LRN 0x00000200
-#define MASK_LOG_POOLING 0x00000400
-#define MASK_LOG_RELU 0x00000800
-#define MASK_LOG_SIGMOID 0x00001000
-#define MASK_LOG_SOFTMAX 0x00002000
-#define MASK_LOG_TANH 0x00004000
-#define MASK_LOG_LC 0x00008000
-#define MASK_LOG_BN 0x00010000
-#define MASK_LOG_CONCAT 0x00020000
-#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t"
-#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t"
-#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
-#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
-#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
-#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
-#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
-#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
-#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
-#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
-#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_BN_INFO MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_CONCAT_INFO MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-extern unsigned int acl_log_flags;
-#endif //USE_PROFILING
-namespace caffe {
-#ifdef USE_ACL
-enum TensorType{
- tensor_input,
- tensor_output,
- tensor_weights,
- tensor_biases,
- tensor_data,
-};
-template <typename ACLTensor>
-class BaseTensor:public ACLTensor{
-public:
- BaseTensor(bool share)
- :share_(share),type_(tensor_input),allocate_(false){
- }
- virtual void bindmem(void *mem,bool share){
- mem_=mem;
- share_=share;
- }
- virtual void settensortype(TensorType type){
- type_=type;
- };
- virtual void map(bool blocking = true){}
- virtual void unmap(){}
- virtual void commit(TensorType type=tensor_data);
- int tensor_copy(void * mem, bool toTensor=true);
-protected:
- void* mem_;
- bool share_;
- TensorType type_;
- bool allocate_;
-};
-class GPUTensor:public BaseTensor<CLTensor>{
-public:
- explicit GPUTensor(bool share)
- :BaseTensor(share){}
- virtual void map(bool blocking = true){
- if (!allocate_){
- CLTensor::allocator()->allocate();
- allocate_=true;
- }
- CLTensor::map(blocking);
- }
- virtual void unmap(){
- CLTensor::unmap();
- }
-};
-class CPUTensor:public BaseTensor<Tensor>{
-public:
- explicit CPUTensor(bool share)
- :BaseTensor(share){}
- virtual void map(bool blocking = true){
- if (!allocate_){
- Tensor::allocator()->allocate();
- allocate_=true;
- }
- }
- virtual void unmap(){
- }
-};
-template <typename ACLLayer,typename ACLTensor>
-class ACLXPUBaseLayer{
-public:
- virtual void commit(){
- if (input) {
- input->commit(tensor_input);
- }
- if (output){
- output->commit(tensor_output);
- }
- if (weights){
- weights->commit(tensor_weights);
- }
- if (biases){
- biases->commit(tensor_biases);
- }
- }
- virtual void run(bool gpu){
- commit();
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_RUN_INFO);
-#endif //USE_PROFILING
- layer->run();
- if (gpu) {
- // Make sure all the OpenCL jobs are done executing:
- CLScheduler::get().sync();
- }
- }
- virtual bool reshape(TensorShape &shape,TensorType type);
- explicit ACLXPUBaseLayer(){
- layer=nullptr;
- input=nullptr;
- output=nullptr;
- weights=nullptr;
- biases=nullptr;
- mean=nullptr;
- var=nullptr;
- beta=nullptr;
- gamma=nullptr;
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- cache.layer[i] = nullptr;
- cache.input[i] = nullptr;
- cache.output[i] = nullptr;
- cache.weights[i] = nullptr;
- cache.biases[i] = nullptr;
- }
-#endif //USE_CONV_CACHE
- }
- virtual void freelayer(){
-#ifndef USE_CONV_CACHE
- if (layer) delete layer;
- if (input) delete input;
- if (output) delete output;
- if (weights) delete weights;
- if (biases) delete biases;
- if (mean) delete mean;
- if (var) delete var;
- if (beta) delete beta;
- if (gamma) delete gamma;
-#endif //USE_CONV_CACHE
- layer=nullptr;
- input=nullptr;
- output=nullptr;
- weights=nullptr;
- biases=nullptr;
- mean=nullptr;
- var=nullptr;
- beta=nullptr;
- gamma=nullptr;
- }
- virtual ~ACLXPUBaseLayer(){
- freelayer();
- }
- ACLLayer *layer;
- ACLTensor *input;
- ACLTensor *output;
- ACLTensor *weights;
- ACLTensor *biases;
- //for BN
- ACLTensor *mean;
- ACLTensor *var;
- ACLTensor *beta;
- ACLTensor *gamma;
-#ifdef USE_CONV_CACHE
- struct{
- ACLLayer *layer[16];
- ACLTensor *input[16];
- ACLTensor *output[16];
- ACLTensor *weights[16];
- ACLTensor *biases[16];
- }cache;
-#endif //USE_CONV_CACHE
-};
-template <typename GPULayer, typename CPULayer>
-class ACLBaseLayer {
-public:
- explicit ACLBaseLayer();
- virtual void gpu_run();
- virtual void cpu_run();
- virtual ~ACLBaseLayer();
- virtual GPULayer * new_gpulayer();
- virtual CPULayer * new_cpulayer();
- ACLXPUBaseLayer<GPULayer,GPUTensor>& gpu(){
- return gpu_;
- }
- ACLXPUBaseLayer<CPULayer,CPUTensor>& cpu(){
- return cpu_;
- }
- bool checkreshape(TensorShape shape,bool gpu=false, TensorType type=tensor_input);
- template <typename ACLTensor> bool tensor_mem(ACLTensor *tensor,void *mem,bool share=false);
- template <typename ACLTensor> bool tensor_mem(void *mem,ACLTensor *tensor,bool share=false);
- template <typename ACLTensor> bool new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem=nullptr,bool share=false);
-protected:
- ACLXPUBaseLayer<GPULayer,GPUTensor> gpu_;
- ACLXPUBaseLayer<CPULayer,CPUTensor> cpu_;
- bool init_layer_;
- bool force_bypass_acl_path_;
-
-};
-
-#endif
-}
-#define INSTANTIATE_ACLBASECLASS(GPULayer,CPULayer) \
- template class ACLBaseLayer<GPULayer,CPULayer>;
-
-#define INSTANTIATE_ACLBASE_FUNCTION(GPULayer,CPULayer,ACLTensor) \
- template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share); \
- template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share); \
- template bool ACLBaseLayer<GPULayer,CPULayer>::new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem,bool share); \
-
-
-#endif
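The ACLBaseLayer/ACLXPUBaseLayer template pair deleted above is replaced by the single ACLOperator class in the new acl_operator.hpp below, which keeps all of an operator's tensors in one pool and fetches them by (TensorType, index) through tensor_item(). A standalone sketch of that lookup pattern with simplified stand-in types (not part of the commit):

    #include <cassert>
    #include <memory>
    #include <vector>

    enum TensorType { tensor_input, tensor_output, tensor_weights };

    struct Slot {                    // stands in for TensorPair<ACLTensor>
      TensorType type;
      std::unique_ptr<int> tensor;   // stand-in for std::unique_ptr<ACLTensor>
    };

    // Same behaviour as tensor_item(): return the idx-th tensor of the requested type,
    // appending an empty slot of that type when none exists yet.
    std::unique_ptr<int>& slot(std::vector<Slot>& pool, TensorType type, int idx) {
      int count = 0;
      for (Slot& item : pool) {
        if (item.type == type) {
          if (idx == count) return item.tensor;
          ++count;
        }
      }
      pool.push_back(Slot{type, nullptr});
      return pool.back().tensor;
    }

    int main() {
      std::vector<Slot> pool;
      slot(pool, tensor_input, 0) = std::unique_ptr<int>(new int(42));  // first lookup creates the slot
      assert(*slot(pool, tensor_input, 0) == 42);                       // second lookup hits the same slot
      return 0;
    }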
diff --git a/include/caffe/acl_operator.hpp b/include/caffe/acl_operator.hpp
new file mode 100644
index 00000000..90051054
--- /dev/null
+++ b/include/caffe/acl_operator.hpp
@@ -0,0 +1,718 @@
+#ifndef CAFFE_ACL_LAYER_HPP_
+#define CAFFE_ACL_LAYER_HPP_
+
+#ifdef USE_ACL
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "acl_tensor.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
+#define FLAGS_ENABLE_ACL_BNLL 0x00000002
+#define FLAGS_ENABLE_ACL_CONV 0x00000004
+#define FLAGS_ENABLE_ACL_FC 0x00000008
+#define FLAGS_ENABLE_ACL_LRN 0x00000010
+#define FLAGS_ENABLE_ACL_POOLING 0x00000020
+#define FLAGS_ENABLE_ACL_RELU 0x00000040
+#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
+#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
+#define FLAGS_ENABLE_ACL_TANH 0x00000200
+#define FLAGS_ENABLE_ACL_LC 0x00000400
+#define FLAGS_ENABLE_ACL_BN 0x00000800
+#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
+extern unsigned int bypass_acl_class_layer;
+extern unsigned int openailab_intfp;
+#endif
+#ifdef USE_PROFILING
+#include "layer.hpp"
+
+#define MASK_LOG_APP_TIME 0x00000001
+#define MASK_LOG_ALLOCATE 0x00000002
+#define MASK_LOG_RUN 0x00000004
+#define MASK_LOG_CONFIG 0x00000008
+#define MASK_LOG_COPY 0x00000010
+#define MASK_LOG_ABSVAL 0x00000020
+#define MASK_LOG_BNLL 0x00000040
+#define MASK_LOG_CONV 0x00000080
+#define MASK_LOG_FC 0x00000100
+#define MASK_LOG_LRN 0x00000200
+#define MASK_LOG_POOLING 0x00000400
+#define MASK_LOG_RELU 0x00000800
+#define MASK_LOG_SIGMOID 0x00001000
+#define MASK_LOG_SOFTMAX 0x00002000
+#define MASK_LOG_TANH 0x00004000
+#define MASK_LOG_LC 0x00008000
+#define MASK_LOG_BN 0x00010000
+#define MASK_LOG_CONCAT 0x00020000
+#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t"
+#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t"
+#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
+#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
+#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
+#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
+#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
+#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
+#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
+#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
+#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_BN_INFO MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_CONCAT_INFO MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+extern unsigned int acl_log_flags;
+#endif //USE_PROFILING
+namespace caffe {
+#ifdef USE_ACL
+enum TensorType{
+ tensor_input,
+ tensor_output,
+ tensor_weights,
+ tensor_biases,
+ tensor_mean,
+ tensor_var,
+ tensor_beta,
+ tensor_gamma,
+ tensor_concat,
+ tensor_data,
+};
+enum OperatorState{
+ operator_not_init,
+ operator_init_done,
+ operator_reinit,
+};
+enum OperateType{
+ operate_type_pooling,
+ operate_type_activation,
+ operate_type_lrn,
+ operate_type_conv,
+ operate_type_lc,
+ operate_type_fc,
+ operate_type_bn,
+ operate_type_softmax,
+ operate_type_concat,
+};
+class BaseACLTensor{
+public:
+ BaseACLTensor()
+ :type_(tensor_input),allocate_(false){
+ }
+ virtual void bindmem(void *mem){
+ mem_=mem;
+ }
+ virtual void settensortype(TensorType type){
+ type_=type;
+ };
+ virtual void map(bool blocking = true){
+ }
+ virtual void unmap(){}
+ virtual void commit(TensorType type=tensor_data){}
+ int tensor_copy(arm_compute::ITensor* tensor,void * mem, bool toTensor=true);
+protected:
+ void* mem_;
+ TensorType type_;
+ bool allocate_;
+};
+class ACLTensor:public BaseACLTensor,public Tensor{
+public:
+ ACLTensor(arm_compute::TensorInfo &&info)
+ :Tensor(info){
+ }
+ virtual void map(bool blocking = true){
+ if (!allocate_){
+ Tensor::allocate();
+ allocate_=true;
+ }
+ Tensor::map(blocking);
+ }
+ virtual int tensor_copy(void * mem, bool toTensor=true){
+ auto acl_tensor=this;
+ arm_compute::ITensor* tensor=acl_tensor->tensor();
+ BaseACLTensor::tensor_copy(tensor,mem,toTensor);
+ return 0;
+ }
+ virtual void unmap(){Tensor::unmap();}
+ virtual void commit(TensorType type=tensor_data);
+};
+class ACLSubTensor:public BaseACLTensor,public SubTensor{
+public:
+ ACLSubTensor(std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord)
+ :SubTensor(parent.get(),shape,coord){
+ }
+ virtual int tensor_copy(void * mem, bool toTensor=true){
+ return 0;
+ }
+};
+
+template <typename T>
+class TensorPair{
+public:
+ TensorPair(){}
+ ~TensorPair(){}
+ TensorType type;
+ std::unique_ptr<T> tensor;
+};
+template <typename T>
+std::unique_ptr<T> &tensor_item(std::vector<std::unique_ptr<TensorPair<T>>>& pool,TensorType type,int idx){
+ int count=0;
+ for (auto &item: pool) {
+ if(item.get()->type==type){
+ ++count;
+ }
+ if(item.get()->type==type && idx==count-1){
+ return item.get()->tensor;
+ }
+ }
+ pool.push_back((std::unique_ptr<TensorPair<T>>)std::move(new TensorPair<T>));
+ auto item=pool[pool.size()-1].get();
+ item->type=type;
+ item->tensor=NULL;
+ return item->tensor;
+}
+class ACLOperator {
+public:
+ virtual void commit(){
+ for (auto & item: tensor_pool_) {
+ if(item.get()->tensor)item.get()->tensor->commit(item.get()->type);
+ }
+ }
+ inline void run(){
+ commit();
+ #ifdef USE_PROFILING
+ logtime_util log_time(ACL_RUN_INFO);
+ #endif //USE_PROFILING
+ for(auto &c : funcs_)
+ {
+ c->run();
+ }
+ }
+
+ inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs(){return funcs_;}
+
+ inline std::unique_ptr<ACLSubTensor> &sinput(int idx=0){return tensor_item(subtensor_pool_,tensor_input,idx);}
+ inline std::unique_ptr<ACLSubTensor> &soutput(int idx=0){return tensor_item(subtensor_pool_,tensor_output,idx);}
+ inline std::unique_ptr<ACLSubTensor> &sweights(int idx=0){return tensor_item(subtensor_pool_,tensor_weights,idx);}
+ inline std::unique_ptr<ACLSubTensor> &sbiases(int idx=0){return tensor_item(subtensor_pool_,tensor_biases,idx);}
+
+ inline std::unique_ptr<ACLTensor> &cinput(int idx=0){return tensor_item(tensor_pool_,tensor_concat,idx);}
+ inline std::unique_ptr<ACLTensor> &input(int idx=0){return tensor_item(tensor_pool_,tensor_input,idx);}
+ inline std::unique_ptr<ACLTensor> &output(int idx=0){return tensor_item(tensor_pool_,tensor_output,idx);}
+ inline std::unique_ptr<ACLTensor> &weights(int idx=0){return tensor_item(tensor_pool_,tensor_weights,idx);}
+ inline std::unique_ptr<ACLTensor> &biases(int idx=0){return tensor_item(tensor_pool_,tensor_biases,idx);}
+ inline std::unique_ptr<ACLTensor> &mean(int idx=0){return tensor_item(tensor_pool_,tensor_mean,idx);}
+ inline std::unique_ptr<ACLTensor> &var(int idx=0){return tensor_item(tensor_pool_,tensor_var,idx);}
+ inline std::unique_ptr<ACLTensor> &beta(int idx=0){return tensor_item(tensor_pool_,tensor_beta,idx);}
+ inline std::unique_ptr<ACLTensor> &gamma(int idx=0){return tensor_item(tensor_pool_,tensor_gamma,idx);}
+ inline std::unique_ptr<ACLTensor> &tensor(TensorType type){
+ switch (type) {
+ case tensor_biases:
+ return biases();
+ break;
+ case tensor_weights:
+ return weights();
+ break;
+ case tensor_output:
+ return output();
+ break;
+ default:
+ case tensor_input:
+ return input();
+ break;
+ }
+ return input();
+ }
+
+
+ explicit ACLOperator(const LayerParameter& param);
+ virtual ~ACLOperator();
+ inline TargetHint getTargetHint(){
+#ifdef USE_OPENCL
+ if (target_hint_==TargetHint::DONT_CARE) {
+ if (Caffe::arm_gpu_mode()) {
+ return TargetHint::OPENCL;
+ }
+ return TargetHint::NEON;
+ }
+ return target_hint_;
+#else
+ return TargetHint::NEON;
+#endif
+ }
+ inline void setTargetHint(TargetHint hint){
+ target_hint_=hint;
+ }
+ inline ConvolutionMethodHint & getConvMethod(){ return convolution_method_hint_;}
+ inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor,void *mem){
+ tensor->bindmem(mem);
+ return true;
+ }
+ inline bool tensor_mem(void *mem,std::unique_ptr<ACLTensor> &tensor){
+ tensor->tensor_copy(mem,false);
+ return true;
+ }
+ bool new_tensor(std::unique_ptr<ACLTensor> &tensor,arm_compute::TensorShape &shape,void *mem=nullptr,bool commit=false);
+ bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord);
+ inline int & group(){return _group;}
+ inline void set_operator_property(OperateType type,const char*name){
+ name_=name;
+ type_=type;
+ }
+ inline void acl_run(void *input_data, void *output_data){
+ if(input_data)tensor_mem(input(),input_data);
+ run();
+ tensor_mem(output_data,output());
+ }
+
+
+protected:
+ inline bool isGPUMode(){
+ if (!support_opencl_) return false;
+ return getTargetHint()==TargetHint::OPENCL;
+ }
+ inline OperatorState & opstate(){return operator_state_;}
+ inline bool is_operator_init_done(arm_compute::TensorShape shape,TensorType type=tensor_input){
+ checkreshape(shape,type);
+ return operator_state_==operator_init_done;
+ }
+ inline void set_operator_init_done(){
+ opstate()=operator_init_done;
+ set_bypass_state(false);
+ }
+ inline void set_bypass_state(bool state=false){
+ force_bypass_acl_path_=state;
+ }
+ inline OperatorState checkreshape(arm_compute::TensorShape shape,TensorType type=tensor_input){
+ opstate()=reshape(shape,type);
+ if (opstate()==operator_reinit) {
+ freeres();
+ }
+ return opstate();
+ }
+ inline OperatorState reshape(arm_compute::TensorShape &shape,TensorType type){
+ arm_compute::TensorShape _shape;
+ std::unique_ptr<ACLTensor> &acl_tensor=tensor(type);
+ if (!acl_tensor.get()) return operator_not_init;
+ _shape = acl_tensor->info().tensor_shape();
+ if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
+ return operator_init_done;
+ }
+ return operator_reinit;
+ }
+ inline void freeres(){
+ tensor_pool_.clear();
+ subtensor_pool_.clear();
+ funcs_.clear();
+ }
+ inline const char* &name(){return name_;}
+
+protected:
+ std::vector<std::unique_ptr<TensorPair<ACLTensor>>>tensor_pool_;
+ std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>>subtensor_pool_;
+ std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
+ OperatorState operator_state_;
+ bool force_bypass_acl_path_;
+ TargetHint target_hint_;
+ ConvolutionMethodHint convolution_method_hint_;
+ static bool support_opencl_;
+ static bool init_cl_env;
+ int _group;
+ const char* name_;
+ OperateType type_;
+};
+
+int isScheduleEnable();
+
+template <typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output)
+ );
+
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *output)
+{
+ return instantiate_function<OperatorType, TensorType>(input, output);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input, std::unique_ptr<ACLTensor>& output,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ func = instantiate<GPUOpType, GPUTensor>(input->tensor(), output->tensor());
+ }
+ else
+#endif
+ {
+ func = instantiate<CPUOpType, CPUTensor>(input->tensor(), output->tensor());
+ }
+ return func;
+}
+
+
+template <typename OperatorType, typename TensorType,typename VectorTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(VectorTensor inputs, arm_compute::ITensor *output){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ inputs,
+ dynamic_cast<TensorType *>(output)
+ );
+
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType,typename VectorTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate(VectorTensor inputs, arm_compute::ITensor *output)
+{
+ return instantiate_function<OperatorType, TensorType,VectorTensor>(inputs, output);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(ACLOperator*& acl_op, std::unique_ptr<ACLTensor>& output,int num,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ static std::vector<arm_compute::ICLTensor*> tensors;
+ tensors.clear();
+ for (int i=0;i<num;++i) {
+ tensors.push_back(dynamic_cast<arm_compute::ICLTensor*>(acl_op->cinput(i).get()->tensor()));
+ }
+ func = instantiate<GPUOpType, GPUTensor, std::vector<arm_compute::ICLTensor *>>(tensors, output->tensor());
+ }
+ else
+#endif
+ {
+ static std::vector<arm_compute::ITensor*> tensors;
+ tensors.clear();
+ for (int i=0;i<num;++i) {
+ tensors.push_back(dynamic_cast<arm_compute::ITensor*>(acl_op->cinput(i).get()->tensor()));
+ }
+ func = instantiate<CPUOpType, CPUTensor,std::vector<arm_compute::ITensor*>>(tensors, output->tensor());
+ }
+ return func;
+}
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output, const OperatorInfo &info){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ info);
+
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *output, const OperatorInfo &info)
+{
+ return instantiate_function<OperatorType, TensorType, OperatorInfo>(input, output, info);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input, std::unique_ptr<ACLTensor>& output, const OperatorInfo &info,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ func = instantiate<GPUOpType, GPUTensor,OperatorInfo>(input->tensor(), output->tensor(), info);
+ }
+ else
+#endif
+ {
+ func = instantiate<CPUOpType, CPUTensor,OperatorInfo>(input->tensor(), output->tensor(), info);
+ }
+ return func;
+}
+
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input,arm_compute::ITensor *weights,arm_compute::ITensor *biases, arm_compute::ITensor *output, const OperatorInfo &info){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights),
+ dynamic_cast<TensorType *>(biases),
+ dynamic_cast<TensorType *>(output),
+ info);
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input,arm_compute::ITensor *weights,arm_compute::ITensor *biases, arm_compute::ITensor *output, const OperatorInfo &info)
+{
+ return instantiate_function<OperatorType, TensorType, OperatorInfo>(input,weights,biases,output, info);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor, typename OperatorInfo,typename ACLTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input,std::unique_ptr<ACLTensor>& weights,std::unique_ptr<ACLTensor>& biases, std::unique_ptr<ACLTensor>& output, const OperatorInfo &info,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+ arm_compute::ITensor * biases_tensor=NULL;
+
+ if (biases.get()) {
+ biases_tensor=biases->tensor();
+ }
+#ifdef USE_OPENCL
+ if (hint == TargetHint::OPENCL)
+ {
+ func = instantiate<GPUOpType, GPUTensor,OperatorInfo>(input->tensor(), weights->tensor(),biases_tensor,output->tensor(), info);
+ }
+ else
+#endif
+ {
+ func = instantiate<CPUOpType, CPUTensor,OperatorInfo>(input->tensor(), weights->tensor(),biases_tensor, output->tensor(), info);
+ }
+ return func;
+}
+
+
+
+template <typename Dtype,typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output,
+ arm_compute::ITensor *mean,arm_compute::ITensor *var,arm_compute::ITensor *beta,arm_compute::ITensor *gamma,Dtype & eps){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ dynamic_cast<TensorType *>(mean),
+ dynamic_cast<TensorType *>(var),
+ dynamic_cast<TensorType *>(beta),
+ dynamic_cast<TensorType *>(gamma),
+ eps);
+
+ return std::move(op);
+}
+
+template <typename Dtype,typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor * input,arm_compute::ITensor * output,
+ arm_compute::ITensor * mean,arm_compute::ITensor * var,arm_compute::ITensor * beta,arm_compute::ITensor * gamma,Dtype eps){
+ return instantiate_function<Dtype,OperatorType, TensorType>(input,output, mean,var,beta,gamma,eps);
+}
+
+template <typename Dtype,typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input,std::unique_ptr<ACLTensor>& output,
+ std::unique_ptr<ACLTensor>& mean,std::unique_ptr<ACLTensor>& var,std::unique_ptr<ACLTensor>& beta,std::unique_ptr<ACLTensor>& gamma,Dtype eps,TargetHint hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ func = instantiate<Dtype,GPUOpType, GPUTensor>(input->tensor(),output->tensor(), mean->tensor(),var->tensor(),beta->tensor(),gamma->tensor(),eps);
+ }
+ else
+#endif
+ {
+ func = instantiate<Dtype,CPUOpType, CPUTensor>(input->tensor(),output->tensor(), mean->tensor(),var->tensor(),beta->tensor(),gamma->tensor(),eps);
+ }
+ return func;
+}
+
+
+template <typename OperatorInfo>
+bool instantiate_op_pooling(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input, std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ func.push_back(instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor, arm_compute::NEPoolingLayer, arm_compute::ITensor, arm_compute::PoolingLayerInfo>(input, output, info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_activation(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,arm_compute::ICLTensor,arm_compute::NEActivationLayer,arm_compute::ITensor, arm_compute::ActivationLayerInfo>(input, output, info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_lrn(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,arm_compute::ICLTensor,arm_compute::NENormalizationLayer,arm_compute::ITensor, arm_compute::NormalizationLayerInfo>(input, output, info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_conv(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,const OperatorInfo &info){
+ std::unique_ptr<ACLTensor> & weights=acl_op->weights();
+ std::unique_ptr<ACLTensor> & biases=acl_op->biases();
+ ConvolutionMethodHint& conv_method=acl_op->getConvMethod();
+ bool has_biases=biases.get()?true:false;
+ int& groups=acl_op->group();
+ arm_compute::TensorShape input_shape=input->info().tensor_shape();
+ arm_compute::TensorShape weights_shape=weights->info().tensor_shape();
+ arm_compute::TensorShape biases_shape;
+ if (has_biases) {
+ biases_shape = biases->info().tensor_shape();
+ }
+ arm_compute::TensorShape output_shape=output->info().tensor_shape();
+
+ if (groups==1) {
+ if (conv_method == ConvolutionMethodHint::GEMM) {
+ func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo>(acl_op->input(), acl_op->weights(), acl_op->biases(), acl_op->output(), info, hint));
+ }else{
+ func.push_back(instantiate_op_func<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo>(acl_op->input(), acl_op->weights(), acl_op->biases(), acl_op->output(), info, hint));
+ }
+ return true;
+ }
+
+ // Calculate sub-tensor splits
+ const int input_split = input_shape.z() / groups;
+ const int output_split = output_shape.z() / groups;
+ const int weights_split = weights_shape[3] / groups;
+ const int biases_split = biases_shape.x() / groups;
+
+ // Calculate sub-tensor shapes
+ input_shape.set(2, input_split);
+ output_shape.set(2, output_split);
+ weights_shape.set(3, weights_split);
+ biases_shape.set(0, biases_split);
+
+ for (auto i = 0; i < groups; ++i) {
+ // Calculate sub-tensors starting coordinates
+ arm_compute::Coordinates input_coord(0, 0, input_split * i);
+ arm_compute::Coordinates output_coord(0, 0, output_split * i);
+ arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
+ arm_compute::Coordinates biases_coord(biases_split * i);
+
+ // Create sub-tensors for input, output, weights and bias
+ acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape, input_coord);
+ acl_op->new_tensor(acl_op->soutput(i),acl_op->output(),output_shape, output_coord);
+ acl_op->new_tensor(acl_op->sweights(i),acl_op->weights(), weights_shape, weights_coord);
+ if (has_biases) {
+ acl_op->new_tensor(acl_op->sbiases(i),acl_op->biases(), biases_shape, biases_coord);
+ }
+
+ if (conv_method == ConvolutionMethodHint::GEMM) {
+ func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo,ACLSubTensor>(acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), acl_op->soutput(i), info, hint));
+ }else{
+ func.push_back(instantiate_op_func<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo,ACLSubTensor>(acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), acl_op->soutput(i), info, hint));
+ }
+ }
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_lc(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ std::unique_ptr<ACLTensor> & weights=acl_op->weights();
+ std::unique_ptr<ACLTensor> & biases=acl_op->biases();
+ func.push_back(instantiate_op_func<arm_compute::CLLocallyConnectedLayer,arm_compute::ICLTensor,arm_compute::NELocallyConnectedLayer,arm_compute::ITensor, arm_compute::PadStrideInfo>(input, weights,biases,output,info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_fc(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ std::unique_ptr<ACLTensor> & weights=acl_op->weights();
+ std::unique_ptr<ACLTensor> & biases=acl_op->biases();
+ func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,arm_compute::ICLTensor,arm_compute::NEFullyConnectedLayer,arm_compute::ITensor, bool>(input, weights,biases,output,info, hint));
+ return true;
+}
+template <typename Dtype>
+bool instantiate_op_bn(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, Dtype eps){
+ std::unique_ptr<ACLTensor> & mean=acl_op->mean();
+ std::unique_ptr<ACLTensor> & var=acl_op->var();
+ std::unique_ptr<ACLTensor> & beta=acl_op->beta();
+ std::unique_ptr<ACLTensor> & gamma=acl_op->gamma();
+ func.push_back(instantiate_op_func<Dtype,arm_compute::CLBatchNormalizationLayer,arm_compute::ICLTensor,arm_compute::NEBatchNormalizationLayer,arm_compute::ITensor>(input, output, mean,var,beta,gamma,eps, hint));
+ return true;
+}
+inline bool instantiate_op_softmax(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,void *data){
+ func.push_back(instantiate_op_func<arm_compute::CLSoftmaxLayer,arm_compute::ICLTensor,arm_compute::NESoftmaxLayer,arm_compute::ITensor>(input, output, hint));
+ return true;
+}
+inline bool instantiate_op_concat(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,int num){
+ func.push_back(instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,arm_compute::ICLTensor,arm_compute::NEDepthConcatenateLayer,arm_compute::ITensor>(acl_op, output, num,hint));
+ return true;
+}
+template <typename Dtype>
+Dtype* GetDataPtr(ACLOperator* op,Blob<Dtype>* const &blob,bool isconst=false){
+ if (!isconst) {
+ if (op->getTargetHint() == TargetHint::NEON) {
+ return blob->mutable_cpu_data();
+ }
+ return blob->mutable_gpu_data();
+ }
+ if (op->getTargetHint()==TargetHint::NEON) {
+ return (Dtype*)blob->cpu_data();
+ }
+ return (Dtype*)blob->gpu_data();
+}
+
+template <typename Dtype>
+Dtype* InputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,int index=-1){
+ if (index==-1) index=0;
+ return GetDataPtr(op, bottom[index], true);
+}
+template <typename Dtype>
+Dtype* OutputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& top){
+ return GetDataPtr(op,top[0]);
+}
+
+template <typename Dtype>
+void acl_run(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top,bool multi_input_run=true){
+ if (multi_input_run) {
+ for (int i = 0; i < bottom.size(); ++i) {
+ const Dtype* bottom_data = bottom[i]->cpu_data();
+ Dtype* top_data = top[i]->mutable_cpu_data();
+ op->acl_run((void*)bottom_data,(void*)top_data);
+ }
+ return ;
+ }
+ for (int i = 0; i < bottom.size(); ++i) {
+ op->tensor_mem(op->cinput(i),InputdataPtr(op,bottom,i));
+ }
+ op->acl_run(NULL,OutputdataPtr(op,top));
+}
+}
+
+#define INIT_GLOBAL_FUNCS_TYPE(Dtype) \
+template <> \
+Dtype* InputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,int index); \
+template <> \
+Dtype* OutputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& top); \
+template <> \
+Dtype* GetDataPtr(ACLOperator* op,Blob<Dtype>* const & blob,bool isconst); \
+
+#define INIT_GLOBAL_FUNCS() \
+INIT_GLOBAL_FUNCS_TYPE(double); \
+INIT_GLOBAL_FUNCS_TYPE(float); \
+
+
+#ifdef USE_PROFILING
+#define acl_configure(opname,acl_op,args...)\
+{\
+ set_operator_property(operate_type_##opname,#opname); \
+ logtime_util log_time(ACL_CONFIG_INFO); \
+ instantiate_op_##opname(acl_op,acl_op->funcs(),acl_op->input(),acl_op->output(),acl_op->getTargetHint(),args);\
+}
+#else
+#define acl_configure(opname,acl_op,args...)\
+{\
+ set_operator_property(operate_type_##opname,#opname); \
+ instantiate_op_##opname(acl_op,acl_op->funcs(),acl_op->input(),acl_op->output(),acl_op->getTargetHint(),args);\
+}
+#endif
+
+#endif
+
+#endif
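For reference, the grouped path of instantiate_op_conv above carves each parent tensor into per-group sub-tensors before instantiating one convolution function per group. Below is a minimal standalone sketch of that split arithmetic, assuming a 2-group convolution with a 64-channel input, 128 kernels, and the W,H,C,N / Kx,Ky,Cin,Cout shape layout used above; the concrete sizes are illustrative only (note that the Caffe weight blob already stores Cin per group, which is why only dimension 3 of the weights is split).

    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Coordinates.h"
    #include <cstdio>

    int main() {
      const int groups = 2;
      arm_compute::TensorShape input_shape(56, 56, 64, 1);    // W, H, C, N
      arm_compute::TensorShape weights_shape(3, 3, 32, 128);  // Kx, Ky, Cin (already per group), Cout
      arm_compute::TensorShape output_shape(56, 56, 128, 1);

      const int input_split   = input_shape.z()  / groups;    // channels fed to each group
      const int output_split  = output_shape.z() / groups;    // channels produced by each group
      const int weights_split = weights_shape[3] / groups;    // kernels owned by each group

      // Every per-group sub-tensor gets the reduced shape ...
      input_shape.set(2, input_split);
      output_shape.set(2, output_split);
      weights_shape.set(3, weights_split);

      // ... and starts at an offset that is a multiple of the split size.
      for (int i = 0; i < groups; ++i) {
        arm_compute::Coordinates input_coord(0, 0, input_split * i);
        arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
        std::printf("group %d: input channel offset %d, kernel offset %d\n",
                    i, (int)input_coord[2], (int)weights_coord[3]);
      }
      return 0;
    }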
diff --git a/include/caffe/acl_tensor.hpp b/include/caffe/acl_tensor.hpp
new file mode 100644
index 00000000..89466de3
--- /dev/null
+++ b/include/caffe/acl_tensor.hpp
@@ -0,0 +1,114 @@
+#ifndef __TENSOR_H__
+#define __TENSOR_H__
+
+#ifdef USE_ACL
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace caffe{
+enum class TargetHint{
+ DONT_CARE,
+ OPENCL,
+ NEON,
+};
+
+enum class ConvolutionMethodHint{
+ GEMM,
+ DIRECT,
+};
+namespace cpp14{
+template <class T>
+struct _Unique_if{
+ typedef std::unique_ptr<T> _Single_object;
+};
+
+template <class T>
+struct _Unique_if<T[]>{
+ typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+
+template <class T, size_t N>
+struct _Unique_if<T[N]>{
+ typedef void _Known_bound;
+};
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object
+make_unique(Args &&... args){
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template <class T>
+typename _Unique_if<T>::_Unknown_bound
+make_unique(size_t n){
+ typedef typename std::remove_extent<T>::type U;
+ return std::unique_ptr<T>(new U[n]());
+}
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound
+make_unique(Args &&...) ;
+}
+
+class Tensor {
+public:
+ Tensor(arm_compute::TensorInfo &info) noexcept;
+ ~Tensor(){
+ }
+ Tensor(Tensor &&src) noexcept ;
+ void set_info(arm_compute::TensorInfo &&info){
+ _info = info;
+ }
+ arm_compute::ITensor *set_target(TargetHint target);
+ const arm_compute::TensorInfo &info() const{
+ return _info;
+ }
+ arm_compute::ITensor * tensor(){
+ return _tensor.get();
+ }
+ void allocate();
+ void init(){
+
+ }
+ TargetHint target() const{
+ return _target;
+ }
+ virtual void map(bool blocking = true);
+ virtual void unmap();
+
+private:
+ TargetHint _target;
+ arm_compute::TensorInfo _info;
+ std::unique_ptr<arm_compute::ITensor> _tensor;
+};
+
+class SubTensor
+{
+public:
+ SubTensor(Tensor* parent, arm_compute::TensorShape& tensor_shape, arm_compute::Coordinates& coords)noexcept;
+ ~SubTensor(){}
+ arm_compute::ITensor *tensor() ;
+ const arm_compute::ITensor *tensor() const ;
+ TargetHint target() const ;
+ void allocate() ;
+ arm_compute::ITensor *set_target(TargetHint target);
+
+private:
+ /** Instantiates a sub-tensor */
+ void instantiate_subtensor();
+
+private:
+ TargetHint _target; /**< Target that this tensor is pinned on */
+ arm_compute::TensorShape _tensor_shape; /**< SubTensor shape */
+ arm_compute::Coordinates _coords; /**< SubTensor Coordinates */
+ arm_compute::ITensor *_parent; /**< Parent tensor */
+ std::unique_ptr<arm_compute::ITensor> _subtensor; /**< SubTensor */
+};
+
+}
+#endif
+#endif //__TENSOR_H__
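For reference, a minimal sketch of how the Tensor wrapper declared above is used: it holds only TensorInfo metadata until set_target() backs it with a NEON Tensor or an OpenCL CLTensor, after which allocate() creates the buffer; map()/unmap() only matter for the OpenCL case. The shape and format below are illustrative only.

    #include "caffe/acl_tensor.hpp"

    void tensor_wrapper_sketch() {
      // Describe a 224x224x3 F32 tensor; nothing is allocated yet.
      arm_compute::TensorInfo info(arm_compute::TensorShape(224, 224, 3),
                                   arm_compute::Format::F32);
      caffe::Tensor t(info);

      // Back the wrapper with a NEON tensor and allocate its buffer.
      t.set_target(caffe::TargetHint::NEON);
      t.allocate();

      // With TargetHint::OPENCL the buffer lives on the device, so host-side
      // access must be bracketed by map()/unmap():
      //   t.map();  ...read or write through t.tensor()->buffer()...  t.unmap();
    }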
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 49b1e695..4ffeb68c 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -22,6 +22,7 @@ extern unsigned int acl_log_flags;
namespace boost { class mutex; }
namespace caffe {
+bool AclEnableSchedule(int enable=1);
#ifdef USE_PROFILING
class logtime_util
{
diff --git a/include/caffe/layers/acl_absval_layer.hpp b/include/caffe/layers/acl_absval_layer.hpp
index c1655404..9eba67de 100644
--- a/include/caffe/layers/acl_absval_layer.hpp
+++ b/include/caffe/layers/acl_absval_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/absval_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -46,9 +46,10 @@ class ACLAbsValLayer : public ACLBaseActivationLayer<Dtype>,public AbsValLayer<D
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_base_activation_layer.hpp b/include/caffe/layers/acl_base_activation_layer.hpp
index e2abdafa..39643f33 100644
--- a/include/caffe/layers/acl_base_activation_layer.hpp
+++ b/include/caffe/layers/acl_base_activation_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/neuron_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -23,9 +23,9 @@ namespace caffe {
* Fallback to BNLLLayer for some corner cases.
*/
template <typename Dtype>
-class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivationLayer> {
+class ACLBaseActivationLayer : public ACLOperator {
public:
- explicit ACLBaseActivationLayer(const LayerParameter& param)
+ explicit ACLBaseActivationLayer(const LayerParameter& param):ACLOperator(param)
{}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -46,8 +46,8 @@ class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivatio
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type=ActivationLayerInfo::ActivationFunction::RELU);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type=arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
};
#endif
diff --git a/include/caffe/layers/acl_base_conv_layer.hpp b/include/caffe/layers/acl_base_conv_layer.hpp
deleted file mode 100644
index 6b38eb28..00000000
--- a/include/caffe/layers/acl_base_conv_layer.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef CAFFE_ACL_BASE_CONV_LAYER_HPP_
-#define CAFFE_ACL_BASE_CONV_LAYER_HPP_
-
-#include <vector>
-
-#include "caffe/blob.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/proto/caffe.pb.h"
-
-#include "caffe/layers/conv_layer.hpp"
-
-#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
-#endif
-
-namespace caffe {
-
-#ifdef USE_ACL
-/*
- * @brief ACL implementation of ConvolutionLayer.
- * Fallback to ConvolutionLayer for some corner cases.
- *
-*/
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-class ACLConvolutionLayer : public ACLBaseLayer<GPUConvLayer,CPUConvLayer>,public ConvolutionLayer<Dtype> {
- public:
- explicit ACLConvolutionLayer(const LayerParameter& param)
- : ConvolutionLayer<Dtype>(param) {}
- virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual ~ACLConvolutionLayer();
-
- protected:
- virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
- const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
- NOT_IMPLEMENTED;
- }
- virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
- const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
- NOT_IMPLEMENTED;
- }
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
-
-};
-#endif
-
-} // namespace caffe
-
-// Instantiate a class with float and double specifications.
-#define INSTANTIATE_CONV_CLASS(classname,GPUConvLayer,CPUConvLayer) \
- template class classname<float,GPUConvLayer,CPUConvLayer>; \
- template class classname<double,GPUConvLayer,CPUConvLayer>
-
-#endif // CAFFE_ACL_BASE_CONV_LAYER_HPP_
diff --git a/include/caffe/layers/acl_batch_norm_layer.hpp b/include/caffe/layers/acl_batch_norm_layer.hpp
index e899804f..97dcab3d 100644
--- a/include/caffe/layers/acl_batch_norm_layer.hpp
+++ b/include/caffe/layers/acl_batch_norm_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/batch_norm_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to BatchNormLayer for some corner cases.
*/
template <typename Dtype>
-class ACLBatchNormLayer : public ACLBaseLayer<CLBatchNormalizationLayer,NEBatchNormalizationLayer>,public BatchNormLayer<Dtype> {
+class ACLBatchNormLayer : public ACLOperator,public BatchNormLayer<Dtype> {
public:
explicit ACLBatchNormLayer(const LayerParameter& param)
- : BatchNormLayer<Dtype>(param) {}
+ : ACLOperator(param),BatchNormLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLBatchNormLayer : public ACLBaseLayer<CLBatchNormalizationLayer,NEBatchN
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_bnll_layer.hpp b/include/caffe/layers/acl_bnll_layer.hpp
index ea2f8a16..9cf607e1 100644
--- a/include/caffe/layers/acl_bnll_layer.hpp
+++ b/include/caffe/layers/acl_bnll_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/bnll_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -47,8 +47,9 @@ class ACLBNLLLayer : public ACLBaseActivationLayer<Dtype>,public BNLLLayer<Dtype
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_concat_layer.hpp b/include/caffe/layers/acl_concat_layer.hpp
index 90212192..bc1c9173 100644
--- a/include/caffe/layers/acl_concat_layer.hpp
+++ b/include/caffe/layers/acl_concat_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/concat_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to ConcatLayer for some corner cases.
*/
template <typename Dtype>
-class ACLConcatLayer : public ACLBaseLayer<CLDepthConcatenate,NEDepthConcatenate>,public ConcatLayer<Dtype> {
+class ACLConcatLayer : public ACLOperator,public ConcatLayer<Dtype> {
public:
explicit ACLConcatLayer(const LayerParameter& param)
- : ConcatLayer<Dtype>(param) {}
+ : ACLOperator(param),ConcatLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,11 +44,9 @@ class ACLConcatLayer : public ACLBaseLayer<CLDepthConcatenate,NEDepthConcatenate
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
- private:
- std::vector<ITensor *> cpu_vectors;
- std::vector<ICLTensor *> gpu_vectors;
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_conv_layer.hpp b/include/caffe/layers/acl_conv_layer.hpp
index 2fd795c9..21ca6aa6 100644
--- a/include/caffe/layers/acl_conv_layer.hpp
+++ b/include/caffe/layers/acl_conv_layer.hpp
@@ -1,46 +1,56 @@
#ifndef CAFFE_ACL_CONV_LAYER_HPP_
#define CAFFE_ACL_CONV_LAYER_HPP_
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/conv_layer.hpp"
+
#ifdef USE_ACL
-#include "caffe/layers/acl_base_conv_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
-extern bool use_direct_conv_;
#ifdef USE_ACL
+/*
+ * @brief ACL implementation of ConvolutionLayer.
+ * Fallback to ConvolutionLayer for some corner cases.
+ *
+*/
template <typename Dtype>
-inline shared_ptr<Layer<Dtype> > GetACLConvolutionLayer(
- const LayerParameter& param) {
- ConvolutionParameter conv_param = param.convolution_param();
- const char* pDirectConv;
- pDirectConv = getenv ("DIRECTCONV");
- if (pDirectConv){
- unsigned int bdirectconv;
- sscanf(pDirectConv,"%i", &bdirectconv);
- if(bdirectconv != use_direct_conv_){
- use_direct_conv_ = bdirectconv;
- printf("DIRECTCONV<%s>\n", pDirectConv);
- printf("DIRECTCONV: %x\n", use_direct_conv_);
+class ACLConvolutionLayer : public ACLOperator,public ConvolutionLayer<Dtype> {
+ public:
+ explicit ACLConvolutionLayer(const LayerParameter& param)
+ : ACLOperator(param),ConvolutionLayer<Dtype>(param) {
+ }
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLConvolutionLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
}
- }
- int pad_data[3];
- if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
- pad_data[0] = conv_param.pad_h();
- pad_data[1] = conv_param.pad_w();
- } else {
- const int kDefaultPad = 0;
- const int num_pad_dims = conv_param.pad_size();
- for (int i = 0; i < 2; ++i) {
- pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
- conv_param.pad((num_pad_dims == 1) ? 0 : i);
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
}
- }
- if (use_direct_conv_ && ( (conv_param.kernel_size(0)==1 &&pad_data[0]==0 && pad_data[1]==0) || (conv_param.kernel_size(0)==3 && pad_data[0]<=1 && pad_data[1] <=1 ) )) {
- return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype, CLConvolutionLayer, NEDirectConvolutionLayer>(param)); //NEDirectConvolutionLayer only for 1x1 and 3x3
- }
- return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype, CLConvolutionLayer, NEConvolutionLayer>(param));
-}
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
+};
+
#endif
} // namespace caffe
diff --git a/include/caffe/layers/acl_inner_product_layer.hpp b/include/caffe/layers/acl_inner_product_layer.hpp
index f42becb0..67a29914 100644
--- a/include/caffe/layers/acl_inner_product_layer.hpp
+++ b/include/caffe/layers/acl_inner_product_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/inner_product_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,11 @@ namespace caffe {
* Fallback to InnerProductLayer for some corner cases.
*/
template <typename Dtype>
-class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyConnectedLayer>,public InnerProductLayer<Dtype> {
+class ACLInnerProductLayer : public ACLOperator,public InnerProductLayer<Dtype> {
public:
explicit ACLInnerProductLayer(const LayerParameter& param)
- : InnerProductLayer<Dtype>(param) {}
+ : ACLOperator(param),InnerProductLayer<Dtype>(param) {
+ }
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +45,9 @@ class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyCo
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_local_connect_layer.hpp b/include/caffe/layers/acl_local_connect_layer.hpp
index fdb30757..b3b3a9e3 100644
--- a/include/caffe/layers/acl_local_connect_layer.hpp
+++ b/include/caffe/layers/acl_local_connect_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/local_connect_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -22,10 +22,10 @@ namespace caffe {
*
*/
template <typename Dtype>
-class ACLLocalConnectLayer : public ACLBaseLayer<CLLocallyConnectedLayer,NELocallyConnectedLayer>,public LocalConnectLayer<Dtype> {
+class ACLLocalConnectLayer : public ACLOperator,public LocalConnectLayer<Dtype> {
public:
explicit ACLLocalConnectLayer(const LayerParameter& param)
- : LocalConnectLayer<Dtype>(param) {}
+ : ACLOperator(param),LocalConnectLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -45,9 +45,9 @@ class ACLLocalConnectLayer : public ACLBaseLayer<CLLocallyConnectedLayer,NELocal
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
-
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_lrn_layer.hpp b/include/caffe/layers/acl_lrn_layer.hpp
index 6fd9fbc8..1a47f804 100644
--- a/include/caffe/layers/acl_lrn_layer.hpp
+++ b/include/caffe/layers/acl_lrn_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/lrn_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to LRNLayer for some corner cases.
*/
template <typename Dtype>
-class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLayer>,public LRNLayer<Dtype> {
+class ACLLRNLayer : public ACLOperator,public LRNLayer<Dtype> {
public:
explicit ACLLRNLayer(const LayerParameter& param)
- : LRNLayer<Dtype>(param) {}
+ : ACLOperator(param),LRNLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLaye
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_pooling_layer.hpp b/include/caffe/layers/acl_pooling_layer.hpp
index acca35cf..b7f94497 100644
--- a/include/caffe/layers/acl_pooling_layer.hpp
+++ b/include/caffe/layers/acl_pooling_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/pooling_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to PoolingLayer for some corner cases.
*/
template <typename Dtype>
-class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,public PoolingLayer<Dtype> {
+class ACLPoolingLayer : public ACLOperator,public PoolingLayer<Dtype> {
public:
explicit ACLPoolingLayer(const LayerParameter& param)
- : PoolingLayer<Dtype>(param) {}
+ : ACLOperator(param),PoolingLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,publi
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_relu_layer.hpp b/include/caffe/layers/acl_relu_layer.hpp
index 041dbecf..8bbb7267 100644
--- a/include/caffe/layers/acl_relu_layer.hpp
+++ b/include/caffe/layers/acl_relu_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/relu_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -46,8 +46,9 @@ class ACLReLULayer : public ACLBaseActivationLayer<Dtype>,public ReLULayer<Dtype
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_sigmoid_layer.hpp b/include/caffe/layers/acl_sigmoid_layer.hpp
index 8638f73e..0e4bcbaf 100644
--- a/include/caffe/layers/acl_sigmoid_layer.hpp
+++ b/include/caffe/layers/acl_sigmoid_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/sigmoid_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -45,8 +45,9 @@ class ACLSigmoidLayer : public ACLBaseActivationLayer<Dtype>,public SigmoidLayer
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_softmax_layer.hpp b/include/caffe/layers/acl_softmax_layer.hpp
index 9e450f5d..7e69992c 100644
--- a/include/caffe/layers/acl_softmax_layer.hpp
+++ b/include/caffe/layers/acl_softmax_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/softmax_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to SoftmaxLayer for some corner cases.
*/
template <typename Dtype>
-class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,public SoftmaxLayer<Dtype> {
+class ACLSoftmaxLayer : public ACLOperator,public SoftmaxLayer<Dtype> {
public:
explicit ACLSoftmaxLayer(const LayerParameter& param)
- : SoftmaxLayer<Dtype>(param) {}
+ : ACLOperator(param),SoftmaxLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,publi
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_tanh_layer.hpp b/include/caffe/layers/acl_tanh_layer.hpp
index 5a74ce5a..717ef012 100644
--- a/include/caffe/layers/acl_tanh_layer.hpp
+++ b/include/caffe/layers/acl_tanh_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/tanh_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -46,8 +46,9 @@ class ACLTanHLayer : public ACLBaseActivationLayer<Dtype>,public TanHLayer<Dtype
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
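For reference, every header above gains the same pair of hooks: SetupACLOperator() replaces SetupACLLayer() and configures the ACL function, while Bypass_acl() decides per call whether to hand the blobs back to the stock Caffe layer. A minimal sketch of the fallback idiom follows, using a hypothetical ACLExampleLayer; the extra shape check is illustrative and not part of this patch (the actual implementations, such as the AbsVal one later in this diff, may only test the force-bypass flag).

    template <typename Dtype>
    bool ACLExampleLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
                                            const vector<Blob<Dtype>*>& top) {
      bool bypass_acl = false;
      if (this->force_bypass_acl_path_) {   // set from the BYPASSACL environment variable
        bypass_acl = true;
      }
      if (bottom[0]->num_axes() != 4) {     // illustrative: unusual blobs fall back to Caffe
        bypass_acl = true;
      }
      return bypass_acl;
    }

    // Forward_cpu/Forward_gpu then start with:
    //   if (Bypass_acl(bottom, top)) { ExampleLayer<Dtype>::Forward_cpu(bottom, top); return; }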
diff --git a/src/caffe/acl_layer.cpp b/src/caffe/acl_layer.cpp
deleted file mode 100644
index 879b6701..00000000
--- a/src/caffe/acl_layer.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
-
-unsigned int bypass_acl_class_layer = (0 | \
- /*0xffffffff |*/ \
- /*FLAGS_ENABLE_ACL_FC |*/ \
- /*FLAGS_ENABLE_ACL_LRN |*/ \
- 0 );
-
-#ifdef USE_PROFILING
-
-#include "arm_neon.h"
-
-unsigned int acl_log_flags = (0 | \
- MASK_LOG_APP_TIME | \
- /*MASK_LOG_ALLOCATE | */\
- /*MASK_LOG_ALLOCATE | */\
- /*MASK_LOG_RUN | */\
- /*MASK_LOG_CONFIG | */\
- /*MASK_LOG_COPY | */\
- MASK_LOG_ABSVAL | \
- MASK_LOG_BNLL | \
- MASK_LOG_CONV | \
- MASK_LOG_FC | \
- MASK_LOG_LRN | \
- MASK_LOG_POOLING | \
- MASK_LOG_RELU | \
- MASK_LOG_SIGMOID | \
- MASK_LOG_SOFTMAX | \
- MASK_LOG_TANH | \
- MASK_LOG_LC | \
- MASK_LOG_BN | \
- MASK_LOG_CONCAT | \
- 0);
-#include <stdio.h> /* printf */
-#include <stdlib.h> /* getenv */
-#endif //USE_PROFILING
-
-namespace caffe {
-template <typename GPULayer, typename CPULayer>
-ACLBaseLayer<GPULayer,CPULayer>::ACLBaseLayer()
- :init_layer_(true),force_bypass_acl_path_(false){
- const char* pBypassACL;
- pBypassACL = getenv ("BYPASSACL");
- if (pBypassACL){
- unsigned int bacl;
- sscanf(pBypassACL,"%i", &bacl);
- if(bacl != bypass_acl_class_layer){
- bypass_acl_class_layer = bacl;
- printf("BYPASSACL<%s>\n", pBypassACL);
- printf("BYPASSACL: %x\n", bypass_acl_class_layer);
- }
- }
-#ifdef USE_PROFILING
- const char* pLogACL;
- pLogACL = getenv("LOGACL");
- if (pLogACL){
- unsigned int alf;
- sscanf(pLogACL,"%i", &alf);
- if (alf != acl_log_flags){
- acl_log_flags = alf;
- printf("LOGACL<%s>\n", pLogACL);
- printf("LOGACL: %x\n", acl_log_flags);
- }
- }
-#endif //USE_PROFILING
-}
-template <typename GPULayer, typename CPULayer>
-void ACLBaseLayer<GPULayer,CPULayer>::gpu_run() {
- gpu_.run(true);
-}
-template <typename GPULayer, typename CPULayer>
-void ACLBaseLayer<GPULayer,CPULayer>::cpu_run() {
- cpu_.run(false);
-}
-
-template <typename GPULayer, typename CPULayer>
-ACLBaseLayer<GPULayer,CPULayer>::~ACLBaseLayer(){
-}
-template <typename GPULayer, typename CPULayer>
-template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem,bool share)
-{
- tensor=new ACLTensor(share);
-#if 1 //F32
- tensor->allocator()->init(TensorInfo(shape, Format::F32));
-#else //F16
- tensor->allocator()->init(TensorInfo(shape, Format::F16));
-#endif
- tensor->bindmem(mem,share);
- return true;
-}
-
-template <typename ACLTensor>
-void BaseTensor<ACLTensor>::commit(TensorType type){
- settensortype(type);
- if (!share_&&mem_) {
- if (!allocate_){
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_ALLOCATE_INFO);
-#endif //USE_PROFILING
- ACLTensor::allocator()->allocate();
- allocate_=true;
- }
- if (type_!= tensor_output) {
- tensor_copy(mem_);
- }
- mem_=nullptr;
- }
-}
-
-template <typename ACLTensor>
-int BaseTensor<ACLTensor>::tensor_copy(void * mem,bool toTensor)
-{
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_COPY_INFO);
-#endif //USE_PROFILING
- arm_compute::Window window;
- ACLTensor* tensor=this;
- window.use_tensor_dimensions(tensor->info()->tensor_shape(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element)
- int width = tensor->info()->tensor_shape()[0]; //->dimension(0); //window.x().end() - window.x().start(); // + 1;
- int height = tensor->info()->tensor_shape()[1]; //->dimension(1); //window.y().end() - window.y().start(); // + 1;
- int deepth = tensor->info()->tensor_shape()[2];
- map();
- // Create an iterator:
- arm_compute::Iterator it(tensor, window);
- // Except it works for an arbitrary number of dimensions
- if (toTensor) { //mem->tensor
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
- {
-#if 0 //F16
- if (tensor->info()->element_size() ==2)
- {
- for(int i = 0; i < width; i+= 4){
- auto pa = (float32x4_t*)((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4);
- *(float16x4_t*)(((char*)it.ptr()) + i*2) = vcvt_f16_f32(*pa);
- }
- }
- else{
-#endif
- memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size());
-#if 0 //F16
- }
-#endif
- },
- it);
- }else{ //tensor-->mem
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
- {
-#if 0 //F16
- if (tensor->info()->element_size() ==2)
- {
- for(int i = 0; i < width; i+= 4){
- auto pa = (float32x4_t*)(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4));
- *pa = vcvt_f32_f16(*(float16x4_t*)(((char*)it.ptr()) + i*2));
- }
- }
- else{
-#endif
- memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size());
-#if 0 //F16
- }
-#endif
- },
- it);
- }
- unmap();
-
- return 0;
-}
-
-template <typename GPULayer, typename CPULayer>
-template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share)
-{
- tensor->bindmem(mem,share);
- return true;
-}
-
-template <typename GPULayer, typename CPULayer>
-template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share)
-{
- if (mem==tensor->buffer()) return true;
- if (!share) {
- tensor->tensor_copy(mem,false);
- }
- return true;
-}
-
-
-template <typename GPULayer, typename CPULayer>
-bool ACLBaseLayer<GPULayer,CPULayer>::checkreshape(TensorShape shape,bool gpu, TensorType type)
-{
- if (gpu) {
- init_layer_ = gpu_.reshape(shape,type);
- }else{
- init_layer_ = cpu_.reshape(shape,type);
- }
- return init_layer_;
-}
-
-template <typename GPULayer, typename CPULayer>
-GPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_gpulayer(){
- gpu_.layer= new GPULayer;
- return gpu_.layer;
-}
-template <typename GPULayer, typename CPULayer>
-CPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_cpulayer(){
- cpu_.layer= new CPULayer;
- return cpu_.layer;
-}
-template <typename ACLLayer,typename ACLTensor>
-bool ACLXPUBaseLayer<ACLLayer,ACLTensor>::reshape(TensorShape &shape,TensorType type)
-{
- TensorShape _shape;
- if (!layer) return true;
-#ifdef USE_CONV_CACHE
- if (tensor_input == type){
- _shape = input->info()->tensor_shape();
- if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
- return false;
- }
- for(int i = 0; i < 16; ++i){
- if(cache.input[i] == nullptr) break;
- _shape = cache.input[i]->info()->tensor_shape();
- if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
- this->layer = cache.layer[i];
- this->input = cache.input[i];
- this->output = cache.output[i];
- this->weights = cache.weights[i];
- this->biases = cache.biases[i];
- return false;
- }
- }
- }
-#endif //USE_CONV_CACHE
- switch (type) {
- case tensor_biases:
- _shape = biases->info()->tensor_shape();
- break;
- case tensor_weights:
- _shape = weights->info()->tensor_shape();
- break;
- case tensor_output:
- _shape = output->info()->tensor_shape();
- break;
- case tensor_input:
- default:
- _shape = input->info()->tensor_shape();
- break;
- }
- if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
- return false;
- }
- freelayer();
- return true;
-}
-
-INSTANTIATE_ACLBASECLASS(CLNormalizationLayer,NENormalizationLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLActivationLayer,NEActivationLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLPoolingLayer,NEPoolingLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLSoftmaxLayer,NESoftmaxLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLFullyConnectedLayer,NEFullyConnectedLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEConvolutionLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEDirectConvolutionLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEDirectConvolutionLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEDirectConvolutionLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLBatchNormalizationLayer,NEBatchNormalizationLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLBatchNormalizationLayer,NEBatchNormalizationLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLBatchNormalizationLayer,NEBatchNormalizationLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLLocallyConnectedLayer,NELocallyConnectedLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLLocallyConnectedLayer,NELocallyConnectedLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLLocallyConnectedLayer,NELocallyConnectedLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLDepthConcatenate,NEDepthConcatenate);
- INSTANTIATE_ACLBASE_FUNCTION(CLDepthConcatenate,NEDepthConcatenate,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLDepthConcatenate,NEDepthConcatenate,CPUTensor);
-}
-
-#endif
diff --git a/src/caffe/acl_operator.cpp b/src/caffe/acl_operator.cpp
new file mode 100644
index 00000000..b35265a8
--- /dev/null
+++ b/src/caffe/acl_operator.cpp
@@ -0,0 +1,227 @@
+#ifdef USE_ACL
+#include "caffe/acl_operator.hpp"
+#include "caffe/common.hpp"
+
+unsigned int bypass_acl_class_layer = (0 | \
+ FLAGS_ENABLE_ACL_CONCAT | \
+ /*0xffffffff |*/ \
+ /*FLAGS_ENABLE_ACL_FC |*/ \
+ /*FLAGS_ENABLE_ACL_LRN |*/ \
+ 0 );
+
+unsigned int openailab_intfp = 0;
+int enable_schedule=0;
+
+#ifdef USE_PROFILING
+
+#include "arm_neon.h"
+
+unsigned int acl_log_flags = (0 | \
+ MASK_LOG_APP_TIME | \
+ /*MASK_LOG_ALLOCATE | */\
+ /*MASK_LOG_ALLOCATE | */\
+ /*MASK_LOG_RUN | */\
+ /*MASK_LOG_CONFIG | */\
+ /*MASK_LOG_COPY | */\
+ MASK_LOG_ABSVAL | \
+ MASK_LOG_BNLL | \
+ MASK_LOG_CONV | \
+ MASK_LOG_FC | \
+ MASK_LOG_LRN | \
+ MASK_LOG_POOLING | \
+ MASK_LOG_RELU | \
+ MASK_LOG_SIGMOID | \
+ MASK_LOG_SOFTMAX | \
+ MASK_LOG_TANH | \
+ MASK_LOG_LC | \
+ MASK_LOG_BN | \
+ MASK_LOG_CONCAT | \
+ 0);
+#include <stdio.h> /* printf */
+#include <stdlib.h> /* getenv */
+#endif //USE_PROFILING
+
+namespace caffe {
+bool AclEnableSchedule(int enable){
+ enable_schedule=enable;
+ if (enable) {
+ Caffe::set_mode(Caffe::GPU);
+ }
+ return true;
+}
+int isScheduleEnable()
+{
+ return enable_schedule;
+}
+bool ACLOperator::init_cl_env=true;
+bool ACLOperator::support_opencl_=false;
+bool opencl_is_available()
+{
+ return arm_compute::opencl_is_available();
+}
+ACLOperator::ACLOperator(const LayerParameter& param)
+ :operator_state_(operator_not_init),force_bypass_acl_path_(false),
+ target_hint_(TargetHint::DONT_CARE),
+ convolution_method_hint_(ConvolutionMethodHint::GEMM),
+ _group(1),name_(""){
+ const char* pBypassACL;
+ if(init_cl_env){
+#ifdef USE_OPENCL
+ try {
+ if (opencl_is_available()) {
+ arm_compute::CLScheduler::get().default_init();
+ support_opencl_=true;
+ }
+ }catch(std::exception& e){
+ support_opencl_=false;
+ }
+#endif
+ init_cl_env=false;
+ }
+ pBypassACL = getenv ("BYPASSACL");
+ if (pBypassACL){
+ unsigned int bacl;
+ sscanf(pBypassACL,"%i", &bacl);
+ if(bacl != bypass_acl_class_layer){
+ bypass_acl_class_layer = bacl;
+ printf("BYPASSACL<%s>\n", pBypassACL);
+ printf("BYPASSACL: %x\n", bypass_acl_class_layer);
+ }
+ }
+
+ const string& layer_type = param.type();
+ if (layer_type=="Convolution") {
+ ConvolutionParameter conv_param = param.convolution_param();
+ const char* pDirectConv;
+ unsigned int use_direct_conv=0;
+ pDirectConv = getenv ("DIRECTCONV");
+ if (pDirectConv){
+ unsigned int bdirectconv;
+ sscanf(pDirectConv,"%i", &bdirectconv);
+ if(bdirectconv != use_direct_conv){
+ use_direct_conv = bdirectconv;
+ printf("DIRECTCONV<%s>\n", pDirectConv);
+ printf("DIRECTCONV: %x\n", use_direct_conv);
+ }
+ }
+ int pad_data[3];
+ if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
+ pad_data[0] = conv_param.pad_h();
+ pad_data[1] = conv_param.pad_w();
+ } else {
+ const int kDefaultPad = 0;
+ const int num_pad_dims = conv_param.pad_size();
+ for (int i = 0; i < 2; ++i) {
+ pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
+ conv_param.pad((num_pad_dims == 1) ? 0 : i);
+ }
+ }
+ if (use_direct_conv && ( (conv_param.kernel_size(0)==1 &&pad_data[0]==0 && pad_data[1]==0) || (conv_param.kernel_size(0)==3 && pad_data[0]<=1 && pad_data[1] <=1 ) )) {
+ convolution_method_hint_=ConvolutionMethodHint::DIRECT; //NEDirectConvolutionLayer only for 1x1 and 3x3
+ }
+ }
+
+#ifdef USE_PROFILING
+ const char* pLogACL;
+ pLogACL = getenv("LOGACL");
+ if (pLogACL){
+ unsigned int alf;
+ sscanf(pLogACL,"%i", &alf);
+ if (alf != acl_log_flags){
+ acl_log_flags = alf;
+ printf("LOGACL<%s>\n", pLogACL);
+ printf("LOGACL: %x\n", acl_log_flags);
+ }
+ }
+#endif //USE_PROFILING
+ const char* pEnableSchedule;
+ pEnableSchedule = getenv ("ENABLESCHEDULE");
+ if (pEnableSchedule){
+ unsigned int bschedule;
+ sscanf(pEnableSchedule,"%i", &bschedule);
+ if(bschedule != enable_schedule){
+ enable_schedule = bschedule;
+ printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
+ printf("ENABLESCHEDULE: %x\n", enable_schedule);
+ }
+ if (enable_schedule) {
+ AclEnableSchedule(1);
+ }
+ }
+}
+ACLOperator::~ACLOperator() {
+}
+
+bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor> &tensor,arm_compute::TensorShape &shape,void *mem,bool commit)
+{
+ auto acl_tensor=new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
+ acl_tensor->set_target(getTargetHint());
+ acl_tensor->bindmem(mem);
+ if (commit) acl_tensor->commit();
+ tensor.reset(acl_tensor);
+ return true;
+}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor> &tensor,std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord)
+{
+ auto acl_tensor=new ACLSubTensor(parent,shape, coord);
+ acl_tensor->set_target(getTargetHint());
+ tensor.reset(acl_tensor);
+ return true;
+}
+
+void ACLTensor::commit(TensorType type)
+{
+ settensortype(type);
+ if (mem_) {
+ if (!allocate_){
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_ALLOCATE_INFO);
+#endif //USE_PROFILING
+ allocate();
+ allocate_=true;
+ }
+ if (type_!= tensor_output) {
+ tensor_copy(mem_);
+ }
+ mem_=nullptr;
+ }
+}
+
+int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor,void * mem,bool toTensor)
+{
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_COPY_INFO);
+#endif //USE_PROFILING
+ arm_compute::Window window;
+ window.use_tensor_dimensions(tensor->info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY); // Iterate through the rows (not each element)
+ int width = tensor->info()->tensor_shape()[0];
+ int height = tensor->info()->tensor_shape()[1];
+ int depth = tensor->info()->tensor_shape()[2];
+ map();
+ // Create an iterator:
+ arm_compute::Iterator it(tensor, window);
+ // The window loop below handles an arbitrary number of tensor dimensions
+ if (toTensor) { //mem->tensor
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size());
+ },
+ it);
+ }else{ //tensor-->mem
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ memcpy(((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size());
+ },
+ it);
+ }
+ unmap();
+
+ return 0;
+}
+
+INIT_GLOBAL_FUNCS();
+
+}
+
+
+#endif
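For reference, all of the run-time knobs read by the ACLOperator constructor above come from the environment and are parsed with sscanf("%i"), so plain decimal or 0x-prefixed hex both work. A minimal sketch of setting them from the hosting process before the first ACL layer is constructed; the concrete values are placeholders, and the meaningful bits are the FLAGS_ENABLE_ACL_* and MASK_LOG_* masks from acl_operator.hpp.

    #include <cstdlib>  // setenv

    void configure_acl_env() {
      // Bitmask of layer types forced back onto the plain Caffe implementation.
      setenv("BYPASSACL", "0x0", 1);
      // Prefer the direct convolution kernels for 1x1 (pad 0) and 3x3 (pad <= 1) convolutions.
      setenv("DIRECTCONV", "1", 1);
      // Ask AclEnableSchedule() to switch Caffe into GPU (OpenCL) mode.
      setenv("ENABLESCHEDULE", "1", 1);
    #ifdef USE_PROFILING
      // Bitmask of MASK_LOG_* profiling categories reported by logtime_util.
      setenv("LOGACL", "0x1", 1);
    #endif
      // Must run before the first ACL layer is created: the values are read once,
      // inside the ACLOperator constructor.
    }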
diff --git a/src/caffe/acl_tensor.cpp b/src/caffe/acl_tensor.cpp
new file mode 100644
index 00000000..1ab82330
--- /dev/null
+++ b/src/caffe/acl_tensor.cpp
@@ -0,0 +1,138 @@
+#include "caffe/acl_tensor.hpp"
+
+namespace caffe {
+
+#ifdef USE_ACL
+template <typename TensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_tensor(arm_compute::TensorInfo &info)
+{
+ auto tensor = cpp14::make_unique<TensorType>();
+ tensor->allocator()->init(info);
+ return std::move(tensor);
+}
+
+template <typename TensorType>
+void tensor_allocate(arm_compute::ITensor &tensor)
+{
+ auto itensor = dynamic_cast<TensorType *>(&tensor);
+ itensor->allocator()->allocate();
+}
+
+Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
+ : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr)
+{
+}
+
+Tensor::Tensor(Tensor &&src) noexcept
+ : _target(src._target),
+ _info(std::move(src._info)),
+ _tensor(std::move(src._tensor))
+{
+}
+
+arm_compute::ITensor *Tensor::set_target(TargetHint target)
+{
+ switch(target)
+ {
+#ifdef USE_OPENCL
+ case TargetHint::OPENCL:
+ _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+ break;
+#endif
+ case TargetHint::NEON:
+ _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+ break;
+ default:
+ break;
+ }
+ _target = target;
+ return _tensor.get();
+}
+
+void Tensor::allocate()
+{
+ switch(_target)
+ {
+#ifdef USE_OPENCL
+ case TargetHint::OPENCL:
+ tensor_allocate<arm_compute::CLTensor>(*_tensor);
+ break;
+#endif
+ case TargetHint::NEON:
+ tensor_allocate<arm_compute::Tensor>(*_tensor);
+ break;
+ default:
+ break;
+ }
+}
+void Tensor::map(bool blocking){
+#ifdef USE_OPENCL
+ if (_target==TargetHint::OPENCL)
+ dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
+#endif
+}
+void Tensor::unmap(){
+#ifdef USE_OPENCL
+ if (_target==TargetHint::OPENCL)
+ dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
+#endif
+}
+
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, arm_compute::TensorShape shape, arm_compute::Coordinates coords)
+{
+ auto ptensor = dynamic_cast<ParentTensorType *>(parent);
+ auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+ return std::move(subtensor);
+}
+SubTensor::SubTensor(Tensor* parent, arm_compute::TensorShape& tensor_shape, arm_compute::Coordinates& coords) noexcept
+ : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr)
+{
+ _parent = parent->tensor();
+ _target = parent->target();
+
+ instantiate_subtensor();
+}
+arm_compute::ITensor *SubTensor::set_target(TargetHint target)
+{
+ return (target == _target) ? _subtensor.get() : nullptr;
+}
+
+arm_compute::ITensor *SubTensor::tensor()
+{
+ return _subtensor.get();
+}
+
+const arm_compute::ITensor *SubTensor::tensor() const
+{
+ return _subtensor.get();
+}
+
+TargetHint SubTensor::target() const
+{
+ return _target;
+}
+
+void SubTensor::allocate()
+{
+ // NOP for sub-tensors
+}
+
+void SubTensor::instantiate_subtensor()
+{
+ switch(_target)
+ {
+#ifdef USE_OPENCL
+ case TargetHint::OPENCL:
+ _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords);
+ break;
+#endif
+ default:
+ case TargetHint::NEON:
+ _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords);
+ break;
+ }
+}
+
+#endif
+}
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index f1db7dd4..dd800b04 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -7,10 +7,6 @@
#include "caffe/common.hpp"
#include "caffe/util/rng.hpp"
-#ifdef USE_ACL
-#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
-#endif
namespace caffe {
@@ -58,18 +54,8 @@ void GlobalInit(int* pargc, char*** pargv) {
Caffe::Caffe()
: random_generator_(), mode_(Caffe::CPU),use_mali_gpu_(false),
- solver_count_(1), solver_rank_(0), multiprocess_(false) {
-#ifdef USE_ACL
-
- try {
- CLScheduler::get().default_init();
- }
- catch(std::exception& e)
+ solver_count_(1), solver_rank_(0), multiprocess_(false)
{
- std::cout << "OPENCL initialization failed"<< std::endl;
- }
-
-#endif
}
Caffe::~Caffe() { }
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index f9e2908c..5e1011e6 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -56,7 +56,7 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
ConvolutionParameter conv_param = param.convolution_param();
ConvolutionParameter_Engine engine = conv_param.engine();
#ifdef USE_ACL
- return GetACLConvolutionLayer<Dtype>(param);
+ return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype>(param));
#endif
#ifdef USE_CUDNN
bool use_dilation = false;
diff --git a/src/caffe/layers/acl_absval_layer.cpp b/src/caffe/layers/acl_absval_layer.cpp
index b0b0304d..deea89c7 100644
--- a/src/caffe/layers/acl_absval_layer.cpp
+++ b/src/caffe/layers/acl_absval_layer.cpp
@@ -15,9 +15,9 @@ void ACLAbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
-void ACLAbsValLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::ABS);
+void ACLAbsValLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::ABS);
}
template <typename Dtype>
@@ -28,12 +28,21 @@ void ACLAbsValLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLAbsValLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLAbsValLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_ABSVAL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
AbsValLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -46,7 +55,7 @@ void ACLAbsValLayer<Dtype>::Forward_gpu(
#ifdef USE_PROFILING
logtime_util log_time(ACL_ABSVAL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
AbsValLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
diff --git a/src/caffe/layers/acl_base_activation_layer.cpp b/src/caffe/layers/acl_base_activation_layer.cpp
index 1fcd682c..cac524fb 100644
--- a/src/caffe/layers/acl_base_activation_layer.cpp
+++ b/src/caffe/layers/acl_base_activation_layer.cpp
@@ -11,50 +11,25 @@ void ACLBaseActivationLayer<Dtype>::LayerSetUp(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
}
template <typename Dtype>
-void ACLBaseActivationLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
+void ACLBaseActivationLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){
const unsigned int count = bottom[0]->count();
const unsigned int count_ = top[0]->count();
- TensorShape input_shape(count);
- TensorShape output_shape(count_);
- checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ arm_compute::TensorShape input_shape(count);
+ arm_compute::TensorShape output_shape(count_);
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
- this->force_bypass_acl_path_=false;
- ActivationLayerInfo act_info(type);
+ // Initialize ACL.
+ arm_compute::ActivationLayerInfo act_info(type);
- if(type== ActivationLayerInfo::ActivationFunction::TANH)
- act_info=ActivationLayerInfo(type,1.0,1.0);
-
-
+ if(type== arm_compute::ActivationLayerInfo::ActivationFunction::TANH)
+ act_info=arm_compute::ActivationLayerInfo(type,1.0,1.0);
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,act_info);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,act_info);
- }
+ new_tensor(input(),input_shape,(void*)InputdataPtr(this,bottom));
+ new_tensor(output(),output_shape,(void*)OutputdataPtr(this,top));
+ acl_configure(activation,this,act_info);
}
template <typename Dtype>
void ACLBaseActivationLayer<Dtype>::Reshape(
@@ -64,27 +39,19 @@ void ACLBaseActivationLayer<Dtype>::Reshape(
template <typename Dtype>
void ACLBaseActivationLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
- Forward_gpu(bottom, top);
+ if(isGPUMode()){
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom, top);
return;
}
- Dtype* top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
void ACLBaseActivationLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- Dtype* top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
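
Note: the new SetupACLOperator bodies in this commit guard configuration with is_operator_init_done()/set_operator_init_done(), keyed on the input TensorShape, so the operator is only rebuilt when the shape changes. The following is a small from-scratch sketch of such a guard; it only approximates what the real acl_operator helpers do.

#include <array>
#include <cstdio>

// Hypothetical, simplified shape (the real type is arm_compute::TensorShape).
struct Shape {
  std::array<unsigned, 4> dims;
  bool operator==(const Shape& o) const { return dims == o.dims; }
};

class Operator {
 public:
  // Returns true when the operator was already configured for this shape.
  bool is_operator_init_done(const Shape& s) {
    if (init_done_ && s == cached_shape_) return true;
    cached_shape_ = s;   // remember the shape we are about to configure for
    init_done_ = false;  // a shape change forces reconfiguration
    return false;
  }
  void set_operator_init_done() { init_done_ = true; }

  void Setup(const Shape& input) {
    if (is_operator_init_done(input)) return;  // cheap early-out on reruns
    set_operator_init_done();
    std::printf("configuring operator for %ux%ux%ux%u\n",
                input.dims[0], input.dims[1], input.dims[2], input.dims[3]);
  }

 private:
  bool init_done_ = false;
  Shape cached_shape_{};
};

int main() {
  Operator op;
  Shape a{{224, 224, 3, 1}}, b{{112, 112, 3, 1}};
  op.Setup(a);  // configures
  op.Setup(a);  // skipped, same shape
  op.Setup(b);  // reconfigures, shape changed
}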
diff --git a/src/caffe/layers/acl_base_conv_layer.cpp b/src/caffe/layers/acl_base_conv_layer.cpp
deleted file mode 100644
index e3c58996..00000000
--- a/src/caffe/layers/acl_base_conv_layer.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-#ifdef USE_ACL
-#include <algorithm>
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layers/acl_conv_layer.hpp"
-
-namespace caffe {
-
-bool use_direct_conv_=false;
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::LayerSetUp(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
- this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top){
-
- TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_gpulayer();
- }else{
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_cpulayer();
- }
- this->force_bypass_acl_path_=false;
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- int stride_x =this->stride_.mutable_cpu_data()[1];
- int stride_y =this->stride_.mutable_cpu_data()[0];
- int pad_x=this->pad_.mutable_cpu_data()[1];
- int pad_y=this->pad_.mutable_cpu_data()[0];
- unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1];
- unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0];
- PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
- TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
- TensorShape biases_shape ((unsigned int)this->num_output_);
- TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
-
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- //[OFM]
- if (this->bias_term_) {
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- }
-
- //[width, height, IFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- fprintf(stderr, "<GPU>check cache[%d]\n", i);
- if(this->gpu().cache.layer[i] == nullptr){
- this->gpu().cache.layer[i] = this->gpu().layer;
- this->gpu().cache.input[i] = this->gpu().input;
- this->gpu().cache.output[i] = this->gpu().output;
- this->gpu().cache.weights[i] = this->gpu().weights;
- this->gpu().cache.biases[i] = this->gpu().biases;
- break;
- }
- }
-#endif //USE_CONV_CACHE
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- //[OFM]
- if (this->bias_term_) {
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- }
-
- //[width, height, IFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- fprintf(stderr, "<CPU>check cache[%d]\n", i);
- if(this->cpu().cache.layer[i] == nullptr){
- this->cpu().cache.layer[i] = this->cpu().layer;
- this->cpu().cache.input[i] = this->cpu().input;
- this->cpu().cache.output[i] = this->cpu().output;
- this->cpu().cache.weights[i] = this->cpu().weights;
- this->cpu().cache.biases[i] = this->cpu().biases;
- break;
- }
- }
-#endif //USE_CONV_CACHE
- }
-}
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Reshape(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- ConvolutionLayer<Dtype>::Reshape(bottom, top);
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Forward_cpu(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
- Forward_gpu(bottom, top);
- return;
- }
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONV_INFO);
-#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->group_!=1) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
-
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- /* check dilation */
- int dilated=0;
-
- for(int i=0;i<this->num_spatial_axes_;i++)
- {
- const int *p=this->dilation_.cpu_data();
-
- if(p[i]!=1)
- dilated=1;
- }
- if(dilated) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
-
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->cpu_data();
- Dtype* top_data = top[i]->mutable_cpu_data();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().input,(void*)bottom_data);
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::cpu_run();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem((void*)top_data,this->cpu().output);
- }
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Forward_gpu(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONV_INFO);
-#endif //USE_PROFILING
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (this->force_bypass_acl_path_|| this->group_!=1) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 ) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- /* check dilation */
- int dilated=0;
-
- for(int i=0;i<this->num_spatial_axes_;i++)
- {
- const int *p=this->dilation_.gpu_data();
-
- if(p[i]!=1)
- dilated=1;
- }
-
- if(dilated) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->gpu_data();
- Dtype* top_data = top[i]->mutable_gpu_data();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().input,(void*)bottom_data);
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::gpu_run();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem((void*)top_data,this->gpu().output);
- }
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::~ACLConvolutionLayer() {
-}
-
-#ifdef USE_ACL
-INSTANTIATE_CONV_CLASS(ACLConvolutionLayer,CLConvolutionLayer,NEDirectConvolutionLayer);
-INSTANTIATE_CONV_CLASS(ACLConvolutionLayer,CLConvolutionLayer,NEConvolutionLayer);
-#endif
-
-} // namespace caffe
-#endif // USE_ACL
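
Note: both the deleted acl_base_conv_layer.cpp above and the new acl_conv_layer.cpp later in this patch read stride/pad/kernel with index [1] for the x component and [0] for the y component, because Caffe stores spatial parameters as [height, width] while PadStrideInfo takes x (width) first. A minimal sketch of that index mapping, using hypothetical struct names rather than the real Caffe/ACL types:

#include <cstdio>

// Hypothetical holder mirroring Caffe's [dim0, dim1] = [height, width] storage.
struct CaffeConvParams {
  int kernel[2];  // {kernel_h, kernel_w}
  int stride[2];  // {stride_h, stride_w}
  int pad[2];     // {pad_h, pad_w}
};

struct PadStride {  // stand-in for arm_compute::PadStrideInfo's constructor args
  int stride_x, stride_y, pad_x, pad_y;
};

PadStride ToAclPadStride(const CaffeConvParams& p) {
  // Index 1 is the width/x component, index 0 the height/y component,
  // matching stride_.cpu_data()[1] / [0] in the layer code.
  return PadStride{p.stride[1], p.stride[0], p.pad[1], p.pad[0]};
}

int main() {
  CaffeConvParams p{{3, 5}, {2, 1}, {1, 0}};
  PadStride info = ToAclPadStride(p);
  std::printf("stride_x=%d stride_y=%d pad_x=%d pad_y=%d\n",
              info.stride_x, info.stride_y, info.pad_x, info.pad_y);
  // prints stride_x=1 stride_y=2 pad_x=0 pad_y=1
}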
diff --git a/src/caffe/layers/acl_batch_norm_layer.cpp b/src/caffe/layers/acl_batch_norm_layer.cpp
index a6bc16d1..15df15cb 100644
--- a/src/caffe/layers/acl_batch_norm_layer.cpp
+++ b/src/caffe/layers/acl_batch_norm_layer.cpp
@@ -12,94 +12,43 @@ void ACLBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
}
template <typename Dtype>
-void ACLBatchNormLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLBatchNormLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
-
- this->force_bypass_acl_path_=false;
+ arm_compute::TensorShape in_shape ((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ if (is_operator_init_done(in_shape)) return;
+ set_operator_init_done();
- TensorShape in_shape ((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
- TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
- TensorShape mean_shape((unsigned int)this->channels_);
- TensorShape var_shape=mean_shape;
- TensorShape beta_shape=mean_shape;
- TensorShape gamma_shape=mean_shape;
+ // Initialize ACL.
+ arm_compute::TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
+ arm_compute::TensorShape mean_shape((unsigned int)this->channels_);
+ arm_compute::TensorShape var_shape=mean_shape;
+ arm_compute::TensorShape beta_shape=mean_shape;
+ arm_compute::TensorShape gamma_shape=mean_shape;
Dtype beta_val[beta_shape.total_size()];
Dtype gamma_val[gamma_shape.total_size()];
-
for (int i=0;i<beta_shape.total_size();++i) {
beta_val[i]=0.0;
}
for (int i=0;i<gamma_shape.total_size();++i) {
gamma_val[i]=1.0;
}
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- // use the stored mean/variance estimates.
- const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
- 0 : 1 / this->blobs_[2]->cpu_data()[0];
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[0]->gpu_data(), this->mean_.mutable_gpu_data());
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[1]->gpu_data(), this->variance_.mutable_gpu_data());
- new_tensor(this->gpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,out_shape,(void*)top_data);
- new_tensor(this->gpu().mean,mean_shape);
- new_tensor(this->gpu().var,var_shape);
- new_tensor(this->gpu().beta,beta_shape);
- new_tensor(this->gpu().gamma,gamma_shape);
- tensor_mem(this->gpu().mean,(void*)this->mean_.mutable_gpu_data());
- tensor_mem(this->gpu().var,(void*)this->variance_.mutable_gpu_data());
- tensor_mem(this->gpu().beta,(void*)beta_val);
- tensor_mem(this->gpu().gamma,(void*)gamma_val);
- this->gpu().mean->commit();
- this->gpu().var->commit();
- this->gpu().beta->commit();
- this->gpu().gamma->commit();
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,this->gpu().mean,this->gpu().var,this->gpu().beta,this->gpu().gamma,this->eps_);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- // use the stored mean/variance estimates.
- const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
- 0 : 1 / this->blobs_[2]->cpu_data()[0];
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[0]->cpu_data(), this->mean_.mutable_cpu_data());
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[1]->cpu_data(), this->variance_.mutable_cpu_data());
- new_tensor(this->cpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,out_shape,(void*)top_data);
- new_tensor(this->cpu().mean,mean_shape);
- new_tensor(this->cpu().var,var_shape);
- new_tensor(this->cpu().beta,beta_shape);
- new_tensor(this->cpu().gamma,gamma_shape);
- tensor_mem(this->cpu().mean,(void*)this->mean_.mutable_cpu_data());
- tensor_mem(this->cpu().var,(void*)this->variance_.mutable_cpu_data());
- tensor_mem(this->cpu().beta,(void*)beta_val);
- tensor_mem(this->cpu().gamma,(void*)gamma_val);
- this->cpu().mean->commit();
- this->cpu().var->commit();
- this->cpu().beta->commit();
- this->cpu().gamma->commit();
+ new_tensor(input(),in_shape,InputdataPtr(this,bottom));
+ new_tensor(output(),out_shape,OutputdataPtr(this,top));
+ // use the stored mean/variance estimates.
+ const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
+ 0 : 1 / this->blobs_[2]->cpu_data()[0];
+ caffe_cpu_scale(this->variance_.count(), scale_factor,
+ this->blobs_[0]->cpu_data(), GetDataPtr(this,&this->mean_));
+ caffe_cpu_scale(this->variance_.count(), scale_factor,
+ this->blobs_[1]->cpu_data(), GetDataPtr(this,&this->variance_));
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,this->cpu().mean,this->cpu().var,this->cpu().beta,this->cpu().gamma,this->eps_);
- }
+ new_tensor(mean(),mean_shape,GetDataPtr(this,&this->mean_));
+ new_tensor(var(),var_shape,GetDataPtr(this,&this->variance_));
+ new_tensor(beta(),beta_shape,(void*)beta_val,true);
+ new_tensor(gamma(),gamma_shape,(void*)gamma_val,true);
+ acl_configure(bn,this,this->eps_);
}
template <typename Dtype>
void ACLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -109,43 +58,48 @@ void ACLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLBatchNormLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_||!this->use_global_stats_) {
+ bypass_acl=true;
+ }
+ if (isScheduleEnable()) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_BN_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_||!this->use_global_stats_) {
+ if (Bypass_acl(bottom,top)) {
BatchNormLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- const Dtype* bottom_data = bottom[0]->cpu_data();
- Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
void ACLBatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if (this->force_bypass_acl_path_||!this->use_global_stats_) {
- BatchNormLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
#ifdef USE_PROFILING
logtime_util log_time(ACL_BN_INFO);
#endif //USE_PROFILING
- const Dtype* bottom_data = bottom[0]->gpu_data();
- Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ if (Bypass_acl(bottom,top)) {
+ BatchNormLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
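
Note: the batch-norm setup above rescales the stored running sums by scale_factor = 1 / blobs_[2][0] before handing them to ACL, and feeds constant beta = 0 and gamma = 1 so the ACL kernel performs pure normalization with the global statistics. A standalone sketch of that preparation step (plain C++, no Caffe/ACL types; PrepareGlobalStats is a made-up name):

#include <cmath>
#include <cstdio>
#include <vector>

// Rescale accumulated sums into usable mean/variance, as the layer does with
// caffe_cpu_scale: the stored blobs hold moving-average-weighted sums.
void PrepareGlobalStats(const std::vector<float>& mean_sum,
                        const std::vector<float>& var_sum,
                        float accumulation,            // blobs_[2][0]
                        std::vector<float>* mean,
                        std::vector<float>* variance) {
  const float scale = accumulation == 0.f ? 0.f : 1.f / accumulation;
  mean->resize(mean_sum.size());
  variance->resize(var_sum.size());
  for (size_t i = 0; i < mean_sum.size(); ++i) (*mean)[i] = scale * mean_sum[i];
  for (size_t i = 0; i < var_sum.size(); ++i) (*variance)[i] = scale * var_sum[i];
}

int main() {
  std::vector<float> mean_sum{2.0f, 4.0f}, var_sum{8.0f, 2.0f}, mean, var;
  PrepareGlobalStats(mean_sum, var_sum, /*accumulation=*/2.0f, &mean, &var);
  // With beta=0 and gamma=1 the normalized output is (x - mean) / sqrt(var + eps).
  const float eps = 1e-5f, x = 3.0f;
  std::printf("mean=%.1f var=%.1f y=%.4f\n", mean[0], var[0],
              (x - mean[0]) / std::sqrt(var[0] + eps));
}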
diff --git a/src/caffe/layers/acl_bnll_layer.cpp b/src/caffe/layers/acl_bnll_layer.cpp
index 86f09831..20903cda 100644
--- a/src/caffe/layers/acl_bnll_layer.cpp
+++ b/src/caffe/layers/acl_bnll_layer.cpp
@@ -14,9 +14,9 @@ void ACLBNLLLayer<Dtype>::LayerSetUp(
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BNLL;
}
template <typename Dtype>
-void ACLBNLLLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::SOFT_RELU);
+void ACLBNLLLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU);
}
template <typename Dtype>
void ACLBNLLLayer<Dtype>::Reshape(
@@ -26,12 +26,21 @@ void ACLBNLLLayer<Dtype>::Reshape(
}
template <typename Dtype>
+bool ACLBNLLLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLBNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_BNLL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
BNLLLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -44,7 +53,7 @@ void ACLBNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_BNLL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
BNLLLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
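
Note: the BNLL layer above forwards its setup to the base activation layer with SOFT_RELU, which works because BNLL computes log(1 + e^x), the same function ACL documents for its soft-ReLU activation. A quick numerical check (standalone code, not the ACL kernel itself):

#include <cmath>
#include <cstdio>

// BNLL as computed by the reference Caffe layer (numerically stabilized form).
float bnll(float x) {
  return x > 0 ? x + std::log1p(std::exp(-x)) : std::log1p(std::exp(x));
}

// Soft-ReLU: f(x) = log(1 + e^x).
float soft_relu(float x) { return std::log1p(std::exp(x)); }

int main() {
  for (float x : {-4.f, -1.f, 0.f, 1.f, 4.f})
    std::printf("x=% .1f  bnll=%.6f  soft_relu=%.6f\n", x, bnll(x), soft_relu(x));
  // The two columns agree, which is why SetupACLOperator maps BNLL to SOFT_RELU.
}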
diff --git a/src/caffe/layers/acl_concat_layer.cpp b/src/caffe/layers/acl_concat_layer.cpp
index 57a14126..d849a9e0 100644
--- a/src/caffe/layers/acl_concat_layer.cpp
+++ b/src/caffe/layers/acl_concat_layer.cpp
@@ -9,90 +9,66 @@ template <typename Dtype>
void ACLConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
ConcatLayer<Dtype>::LayerSetUp(bottom, top);
- //this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
- this->force_bypass_acl_path_= true;
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
}
template <typename Dtype>
-void ACLConcatLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLConcatLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
unsigned int channels=0;
for (int i = 0; i < bottom.size(); ++i) {
channels+=bottom[i]->channels();
}
- TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),channels);
+ arm_compute::TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),channels);
+
+ if (is_operator_init_done(out_shape,tensor_output)) return;
+ set_operator_init_done();
- if (!this->init_layer_) return;
- this->init_layer_=false;
// Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
+ std::vector<arm_compute::TensorShape> shapes;
+ for (int i = 0; i < bottom.size(); ++i) {
+ arm_compute::TensorShape in_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[i]->channels());
+ new_tensor(cinput(i),in_shape,InputdataPtr(this,bottom,i));
}
+ new_tensor(output(),out_shape,OutputdataPtr(this,top));
+ acl_configure(concat,this,bottom.size());
- this->force_bypass_acl_path_=false;
-
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->gpu_data();
- TensorShape vec_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[0]->channels());
- GPUTensor *vector;
- new_tensor(vector,vec_shape,(void*)bottom_data);
- tensor_mem(vector,(void*)bottom_data);
- vector->commit();
- gpu_vectors.push_back(vector);
- }
- new_tensor(this->gpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(gpu_vectors,this->gpu().output);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->cpu_data();
- TensorShape vec_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[0]->channels());
- CPUTensor *vector;
- new_tensor(vector,vec_shape,(void*)bottom_data);
- tensor_mem(vector,(void*)bottom_data);
- vector->commit();
- cpu_vectors.push_back(vector);
- }
- new_tensor(this->cpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(cpu_vectors,this->cpu().output);
- }
}
template <typename Dtype>
void ACLConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- ConcatLayer<Dtype>::Reshape(bottom, top);
+ ConcatLayer<Dtype>::Reshape(bottom, top);
+}
+template <typename Dtype>
+bool ACLConcatLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_||this->concat_axis_==0) {
+ bypass_acl=true;
+ }
+ if(isScheduleEnable()){
+ bypass_acl=true;
+ }
+ return bypass_acl;
}
template <typename Dtype>
void ACLConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_CONCAT_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_||this->concat_axis_==0) {
+ if (Bypass_acl(bottom,top)) {
ConcatLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top,false);
}
template <typename Dtype>
@@ -101,27 +77,16 @@ void ACLConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_CONCAT_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_||this->concat_axis_==0) {
+ if (Bypass_acl(bottom,top)) {
ConcatLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top,false);
}
template <typename Dtype>
ACLConcatLayer<Dtype>::~ACLConcatLayer() {
- if(this->force_bypass_acl_path_)return;
- for (int i =0; i < cpu_vectors.size(); i ++) {
- delete cpu_vectors[i];
- }
- for (int i =0; i < gpu_vectors.size(); i ++) {
- delete gpu_vectors[i];
- }
- cpu_vectors.erase(cpu_vectors.begin());
- gpu_vectors.erase(gpu_vectors.begin());
}
INSTANTIATE_CLASS(ACLConcatLayer);
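
Note: the concat setup above builds one input tensor per bottom blob and a single output tensor whose channel count is the sum over the inputs, with width and height unchanged. A sketch of just that shape calculation (BlobShape and ConcatOutputShape are illustrative names):

#include <cstdio>
#include <vector>

// Hypothetical blob shape; only the fields used by the concat setup are kept.
struct BlobShape { unsigned width, height, channels; };

// Channel-axis concatenation: keep width/height, sum channels, mirroring the
// loop at the top of ACLConcatLayer::SetupACLOperator.
BlobShape ConcatOutputShape(const std::vector<BlobShape>& bottoms) {
  BlobShape out{bottoms[0].width, bottoms[0].height, 0};
  for (const BlobShape& b : bottoms) out.channels += b.channels;
  return out;
}

int main() {
  std::vector<BlobShape> bottoms{{13, 13, 256}, {13, 13, 128}, {13, 13, 64}};
  BlobShape out = ConcatOutputShape(bottoms);
  std::printf("%ux%ux%u\n", out.width, out.height, out.channels);  // 13x13x448
}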
diff --git a/src/caffe/layers/acl_conv_layer.cpp b/src/caffe/layers/acl_conv_layer.cpp
new file mode 100644
index 00000000..5cc6fcd9
--- /dev/null
+++ b/src/caffe/layers/acl_conv_layer.cpp
@@ -0,0 +1,147 @@
+#ifdef USE_ACL
+#include <algorithm>
+#include <vector>
+#include "caffe/filler.hpp"
+#include "caffe/layers/acl_conv_layer.hpp"
+
+namespace caffe {
+
+bool use_direct_conv_=false;
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::LayerSetUp(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ arm_compute::TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
+
+ // Initialize ACL.
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ int stride_x =this->stride_.mutable_cpu_data()[1];
+ int stride_y =this->stride_.mutable_cpu_data()[0];
+ int pad_x=this->pad_.mutable_cpu_data()[1];
+ int pad_y=this->pad_.mutable_cpu_data()[0];
+ unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1];
+ unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0];
+ arm_compute::PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
+ arm_compute::TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_/this->group_, (unsigned int)this->num_output_);
+ arm_compute::TensorShape biases_shape ((unsigned int)this->num_output_);
+ arm_compute::TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
+ group()=this->group_;
+
+ //[kernel_x, kernel_y, IFM, OFM]
+ new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get()));
+ //[OFM]
+ if (this->bias_term_) {
+ new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get()));
+ }
+
+ //[width, height, IFM]
+ new_tensor(input(),input_shape,InputdataPtr(this,bottom));
+ //[width, height, OFM]
+ new_tensor(output(),output_shape,OutputdataPtr(this,top));
+
+ acl_configure(conv,this,conv_info);
+}
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Reshape(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ ConvolutionLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+bool ACLConvolutionLayer<Dtype>::Bypass_acl(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ bool bypass_acl=false;
+  if (this->force_bypass_acl_path_|| ((openailab_intfp==0) && (this->group_>=5)) // for performance: a high group count degrades GPU throughput
+      || ((openailab_intfp != 0 && (top[0]->channels() / this->group_ == 1)))) {
+ bypass_acl=true;
+ }
+
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) {
+ bypass_acl=true;
+ }
+ /* check dilation */
+ int dilated=0;
+
+ for(int i=0;i<this->num_spatial_axes_;i++)
+ {
+ const int *p=this->dilation_.cpu_data();
+
+ if(p[i]!=1)
+ dilated=1;
+ }
+ if(dilated) {
+ bypass_acl=true;
+ }
+
+
+ if((this->kernel_shape_.mutable_cpu_data()[1]==1||this->kernel_shape_.mutable_cpu_data()[0]==1) &&
+ isScheduleEnable()){
+ bypass_acl=true;
+ }
+ if((this->kernel_shape_.mutable_cpu_data()[1]==3||this->kernel_shape_.mutable_cpu_data()[0]==3) &&
+ (bottom[0]->channels()<150) && isScheduleEnable()){
+ bypass_acl=true;
+ }
+
+ return bypass_acl;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Forward_cpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ if(isGPUMode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONV_INFO);
+#endif //USE_PROFILING
+
+ if (Bypass_acl(bottom,top)) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+
+ SetupACLOperator(bottom,top);
+
+  // Run the ACL floating-point path.
+ if (openailab_intfp==0){
+ caffe::acl_run(this,bottom,top);
+ }
+ return;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Forward_gpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONV_INFO);
+#endif //USE_PROFILING
+ if (Bypass_acl(bottom,top)) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
+}
+
+template <typename Dtype>
+ACLConvolutionLayer<Dtype>::~ACLConvolutionLayer() {
+}
+
+#ifdef USE_ACL
+INSTANTIATE_CLASS(ACLConvolutionLayer);
+#endif
+
+} // namespace caffe
+#endif // USE_ACL
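
Note: the new ACLConvolutionLayer::Bypass_acl above bundles every condition that sends the layer back to the reference Caffe implementation: forced bypass, large group counts, more than two spatial axes, oversized kernel lists, dilation, and some small-kernel cases when the scheduler is active. A reduced standalone version of that decision, with hypothetical field names in place of the real layer members:

#include <cstdio>
#include <vector>

// Hypothetical description of a convolution; not the Caffe layer itself.
struct ConvDesc {
  bool force_bypass;
  int group;
  int num_spatial_axes;
  std::vector<int> kernel;    // per-axis kernel sizes
  std::vector<int> dilation;  // per-axis dilation factors
};

// Mirrors the shape of the checks in ACLConvolutionLayer::Bypass_acl: any
// condition ACL cannot (or should not) handle falls back to plain Caffe.
bool BypassAcl(const ConvDesc& c) {
  if (c.force_bypass) return true;
  if (c.group >= 5) return true;             // many groups hurt GPU performance
  if (c.num_spatial_axes != 2) return true;  // only 2-D convolutions offloaded
  if (c.kernel.size() > 2) return true;
  for (int d : c.dilation)
    if (d != 1) return true;                 // dilated convolution not offloaded
  return false;
}

int main() {
  ConvDesc plain{false, 1, 2, {3, 3}, {1, 1}};
  ConvDesc dilated{false, 1, 2, {3, 3}, {2, 2}};
  std::printf("plain:   %s\n", BypassAcl(plain) ? "bypass" : "ACL");
  std::printf("dilated: %s\n", BypassAcl(dilated) ? "bypass" : "ACL");
}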
diff --git a/src/caffe/layers/acl_inner_product_layer.cpp b/src/caffe/layers/acl_inner_product_layer.cpp
index 47d1011f..bb819fcc 100644
--- a/src/caffe/layers/acl_inner_product_layer.cpp
+++ b/src/caffe/layers/acl_inner_product_layer.cpp
@@ -1,6 +1,5 @@
#ifdef USE_ACL
#include <vector>
-
#include "caffe/filler.hpp"
#include "caffe/layers/acl_inner_product_layer.hpp"
#include "caffe/util/math_functions.hpp"
@@ -15,65 +14,30 @@ void ACLInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_FC;
}
template <typename Dtype>
-void ACLInnerProductLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLInnerProductLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_);
- TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_);
- TensorShape biases_shape((unsigned int)this->N_);
- TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_);
- TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_);
- checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
+ arm_compute::TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_);
+ arm_compute::TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_);
+ arm_compute::TensorShape biases_shape((unsigned int)this->N_);
+ arm_compute::TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_);
+ arm_compute::TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_);
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
// Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
bool transpose = !this->layer_param_.inner_product_param().transpose();
- this->force_bypass_acl_path_ = false;
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- if (transpose) {
- new_tensor(this->gpu().weights,weights_shape_t,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- }else{
- new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- }
- tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- if (this->bias_term_) {
- new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- }
- new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,transpose);
+ if (transpose) {
+ new_tensor(weights(),weights_shape_t,GetDataPtr(this,this->blobs_[0].get()));
}else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- if (transpose) {
- new_tensor(this->cpu().weights,weights_shape_t,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- }else{
- new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- }
- tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- if (this->bias_term_) {
- new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- }
- new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,transpose);
+ new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get()));
+ }
+ if (this->bias_term_) {
+ new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get()));
}
+ new_tensor(input(),input_shape,InputdataPtr(this,bottom));
+ new_tensor(output(),output_shape,OutputdataPtr(this,top));
+ acl_configure(fc,this,transpose);
}
template <typename Dtype>
void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -82,25 +46,40 @@ void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLInnerProductLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_FC_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom, top)) {
InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+
+ if (this->M_ != 1 && openailab_intfp != 0){
+ InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+
+  // Run the ACL floating-point path.
+ if(openailab_intfp == 0){
+ caffe::acl_run(this,bottom,top);
+ }
+ return;
}
template <typename Dtype>
@@ -109,16 +88,12 @@ void ACLInnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom
#ifdef USE_PROFILING
logtime_util log_time(ACL_FC_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom, top)) {
InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
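
Note: in the inner-product setup above the ACL transpose flag is the negation of the prototxt transpose parameter, and it selects between a (K, N) and an (N, K) weight shape. The sketch below restates only that selection; Shape2D and FcWeightShape are made-up names.

#include <cstdio>

// Stand-in for the two candidate FC weight shapes:
// (K, N) when the blob is treated as transposed, (N, K) otherwise.
struct Shape2D { unsigned x, y; };

Shape2D FcWeightShape(unsigned K, unsigned N, bool prototxt_transpose) {
  // Matches "bool transpose = !this->layer_param_...transpose();" above:
  // the flag passed to ACL is the negation of the Caffe parameter.
  const bool acl_transpose = !prototxt_transpose;
  return acl_transpose ? Shape2D{K, N} : Shape2D{N, K};
}

int main() {
  Shape2D a = FcWeightShape(4096, 1000, /*prototxt_transpose=*/false);
  Shape2D b = FcWeightShape(4096, 1000, /*prototxt_transpose=*/true);
  std::printf("default:    %ux%u\n", a.x, a.y);  // 4096x1000
  std::printf("transposed: %ux%u\n", b.x, b.y);  // 1000x4096
}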
diff --git a/src/caffe/layers/acl_local_connect_layer.cpp b/src/caffe/layers/acl_local_connect_layer.cpp
index 4eed72f9..1846faf4 100644
--- a/src/caffe/layers/acl_local_connect_layer.cpp
+++ b/src/caffe/layers/acl_local_connect_layer.cpp
@@ -15,20 +15,14 @@ void ACLLocalConnectLayer<Dtype>::LayerSetUp(
}
template <typename Dtype>
-void ACLLocalConnectLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLLocalConnectLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
- checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
- this->force_bypass_acl_path_=false;
+ arm_compute::TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
+
+ // Initialize ACL.
ConvolutionParameter conv_param = this->layer_param_.convolution_param();
int stride_x =this->stride_;
int stride_y =this->stride_;
@@ -36,73 +30,23 @@ void ACLLocalConnectLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bott
int pad_y=this->pad_;
unsigned int kernel_x=this->kernel_size_;
unsigned int kernel_y=this->kernel_size_;
- PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
- TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
- TensorShape biases_shape ((unsigned int)this->num_output_);
- TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
-
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- //[OFM]
- if (this->bias_term_) {
- new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- }
+ arm_compute::PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
+ arm_compute::TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
+ arm_compute::TensorShape biases_shape ((unsigned int)this->num_output_);
+ arm_compute::TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
- //[width, height, IFM]
- new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- fprintf(stderr, "<GPU>check cache[%d]\n", i);
- if(this->gpu().cache.layer[i] == nullptr){
- this->gpu().cache.layer[i] = this->gpu().layer;
- this->gpu().cache.input[i] = this->gpu().input;
- this->gpu().cache.output[i] = this->gpu().output;
- this->gpu().cache.weights[i] = this->gpu().weights;
- this->gpu().cache.biases[i] = this->gpu().biases;
- break;
- }
- }
-#endif //USE_CONV_CACHE
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- //[OFM]
- if (this->bias_term_) {
- new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- }
-
- //[width, height, IFM]
- new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
+ //[kernel_x, kernel_y, IFM, OFM]
+ new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get()));
+ //[OFM]
+ if (this->bias_term_) {
+ new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get()));
}
+
+ //[width, height, IFM]
+ new_tensor(input(),input_shape,InputdataPtr(this,bottom));
+ //[width, height, OFM]
+ new_tensor(output(),output_shape,OutputdataPtr(this,top));
+ acl_configure(lc,this,conv_info);
}
template <typename Dtype>
void ACLLocalConnectLayer<Dtype>::Reshape(
@@ -111,34 +55,37 @@ void ACLLocalConnectLayer<Dtype>::Reshape(
}
template <typename Dtype>
+bool ACLLocalConnectLayer<Dtype>::Bypass_acl(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ if (conv_param.kernel_size_size()>2 ) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLLocalConnectLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_LC_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
- LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
-
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (conv_param.kernel_size_size()>2 ) {
+ if (Bypass_acl(bottom,top)) {
LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->cpu_data();
- Dtype* top_data = top[i]->mutable_cpu_data();
- tensor_mem(this->cpu().input,(void*)bottom_data);
- cpu_run();
- tensor_mem((void*)top_data,this->cpu().output);
- }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
@@ -148,22 +95,12 @@ void ACLLocalConnectLayer<Dtype>::Forward_gpu(
logtime_util log_time(ACL_LC_INFO);
#endif //USE_PROFILING
ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (this->force_bypass_acl_path_) {
- LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- if (conv_param.kernel_size_size()>2 ) {
+ if (Bypass_acl(bottom,top)) {
LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->gpu_data();
- Dtype* top_data = top[i]->mutable_gpu_data();
- tensor_mem(this->gpu().input,(void*)bottom_data);
- gpu_run();
- tensor_mem((void*)top_data,this->gpu().output);
- }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
diff --git a/src/caffe/layers/acl_lrn_layer.cpp b/src/caffe/layers/acl_lrn_layer.cpp
index db9630da..2a94e010 100644
--- a/src/caffe/layers/acl_lrn_layer.cpp
+++ b/src/caffe/layers/acl_lrn_layer.cpp
@@ -5,7 +5,7 @@
namespace caffe {
-const NormType IN_MAP=(arm_compute::NormType)0;
+const arm_compute::NormType IN_MAP=(arm_compute::NormType)0;
template <typename Dtype>
void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
@@ -13,46 +13,24 @@ void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
}
template <typename Dtype>
-void ACLLRNLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLLRNLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_);
- checkreshape(shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ arm_compute::TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_);
+ if (is_operator_init_done(shape)) return;
+ set_operator_init_done();
- //this->force_bypass_acl_path_=false;
- NormalizationLayerInfo *norm_info;
+ // Initialize ACL.
+ arm_compute::NormalizationLayerInfo norm_info(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
if(this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL)
- norm_info=new NormalizationLayerInfo(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+ norm_info=arm_compute::NormalizationLayerInfo(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
else
- norm_info=new NormalizationLayerInfo(NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+ norm_info=arm_compute::NormalizationLayerInfo(arm_compute::NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+
+ new_tensor(input(),shape,InputdataPtr(this,bottom));
+ new_tensor(output(),shape,OutputdataPtr(this,top));
+ acl_configure(lrn,this,norm_info);
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,shape,(void*)bottom_data);
- new_tensor(this->gpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,*norm_info);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,shape,(void*)bottom_data);
- new_tensor(this->cpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,*norm_info);
- }
- delete norm_info;
}
template <typename Dtype>
void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -62,35 +40,41 @@ void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLLRNLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLLRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_LRN_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) {
+ if (Bypass_acl(bottom, top)) {
LRNLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
switch (this->layer_param_.lrn_param().norm_region()) {
case LRNParameter_NormRegion_ACROSS_CHANNELS:
for (int n = 0; n < this->num_; ++n) {
- tensor_mem(this->cpu().input,(void*)(bottom_data+ bottom[0]->offset(n)));
- cpu_run();
- tensor_mem((void*)(top_data + top[0]->offset(n)),this->cpu().output);
+ acl_run((void*)(bottom_data+ bottom[0]->offset(n)),(void*)(top_data + top[0]->offset(n)));
}
break;
case LRNParameter_NormRegion_WITHIN_CHANNEL:
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(0, 1);
top_data += top[0]->offset(0, 1);
}
@@ -106,26 +90,22 @@ void ACLLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_LRN_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom, top)) {
LRNLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
switch (this->layer_param_.lrn_param().norm_region()) {
case LRNParameter_NormRegion_ACROSS_CHANNELS:
for (int n = 0; n < this->num_; ++n) {
- tensor_mem(this->gpu().input,(void*)(bottom_data+ bottom[0]->offset(n)));
- gpu_run();
- tensor_mem((void*)(top_data + top[0]->offset(n)),this->gpu().output);
+ acl_run((void*)(bottom_data+ bottom[0]->offset(n)),(void*)(top_data + top[0]->offset(n)));
}
break;
case LRNParameter_NormRegion_WITHIN_CHANNEL:
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(0, 1);
top_data += top[0]->offset(0, 1);
}
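
Note: for ACROSS_CHANNELS LRN the forward paths above run the configured ACL operator once per image, advancing the input and output pointers by the blob's per-image offset. A compact sketch of that batching structure (acl_run_once is a placeholder, not the real runner):

#include <cstdio>
#include <vector>

// Hypothetical runner: processes one image's worth of data per call.
void acl_run_once(const float* in, float* out, size_t per_image) {
  for (size_t i = 0; i < per_image; ++i) out[i] = in[i];  // placeholder kernel
}

int main() {
  const int num = 2, channels = 3, height = 2, width = 2;
  const size_t per_image = channels * height * width;   // == blob->offset(1)
  std::vector<float> bottom(num * per_image, 1.f), top(num * per_image, 0.f);

  // Same structure as ACLLRNLayer::Forward_cpu: one run per image n, with the
  // data pointers advanced by the per-image offset each iteration.
  for (int n = 0; n < num; ++n)
    acl_run_once(bottom.data() + n * per_image, top.data() + n * per_image,
                 per_image);

  std::printf("processed %zu values per image, %d images\n", per_image, num);
}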
diff --git a/src/caffe/layers/acl_pooling_layer.cpp b/src/caffe/layers/acl_pooling_layer.cpp
index f72b2235..f62fb5d4 100644
--- a/src/caffe/layers/acl_pooling_layer.cpp
+++ b/src/caffe/layers/acl_pooling_layer.cpp
@@ -12,48 +12,25 @@ void ACLPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
}
template <typename Dtype>
-void ACLPoolingLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLPoolingLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_,(unsigned int)this->channels_);
- TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_,(unsigned int)this->channels_);
- checkreshape(in_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ arm_compute::TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_,(unsigned int)this->channels_);
+ arm_compute::TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_,(unsigned int)this->channels_);
+ if (is_operator_init_done(in_shape)) return;
+ set_operator_init_done();
- this->force_bypass_acl_path_=false;
- PoolingLayerInfo *pool_info;
+ // Initialize ACL.
+ arm_compute::PoolingLayerInfo pool_info;
if(this->layer_param_.pooling_param().pool()==PoolingParameter_PoolMethod_MAX)
- pool_info=new PoolingLayerInfo(PoolingType::MAX, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL));
+ pool_info=arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, this->kernel_w_, arm_compute::PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,arm_compute::DimensionRoundingType::CEIL));
else
- pool_info=new PoolingLayerInfo(PoolingType::AVG, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL));
+ pool_info=arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG, this->kernel_w_, arm_compute::PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,arm_compute::DimensionRoundingType::CEIL));
+
+ new_tensor(input(),in_shape,InputdataPtr(this,bottom));
+ new_tensor(output(),out_shape,OutputdataPtr(this,top));
+ acl_configure(pooling,this,pool_info);
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,*pool_info);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,*pool_info);
- }
- delete pool_info;
}
template <typename Dtype>
void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -63,39 +40,44 @@ void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLPoolingLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) {
+ bypass_acl=true;
+ }
+ if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
+ this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
+ bypass_acl=true;
+ }
+ if (this->kernel_h_!=this->kernel_w_) {
+ bypass_acl=true;
+ }
+ if (this->kernel_h_!=2 && this->kernel_h_!=3) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_POOLING_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) {
+ if (Bypass_acl(bottom,top)) {
PoolingLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
- if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
- this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=this->kernel_w_ || top.size()>1) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=2 && this->kernel_h_!=3) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(1);
top_data += top[0]->offset(1);
}
@@ -107,30 +89,15 @@ void ACLPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_POOLING_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) {
+ if (Bypass_acl(bottom,top)) {
PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return;
+ return ;
}
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
- if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
- this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=this->kernel_w_) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=2 && this->kernel_h_!=3) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(1);
top_data += top[0]->offset(1);
}
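
The pooling hunks above show the shape of the refactor this patch applies to each ACL layer: a single Bypass_acl() predicate gathers the fall-back-to-plain-Caffe conditions, SetupACLOperator() replaces the per-device SetupACLLayer(), and the batch loop calls one acl_run() instead of tensor_mem()/cpu_run()/gpu_run(). Below is a minimal standalone sketch of that dispatch, using toy types rather than the real Caffe/ACL classes.

#include <cstddef>
#include <cstdio>
#include <vector>

struct ToyPoolingOp {
  bool force_bypass = false;
  int kernel_h = 2, kernel_w = 2;

  // Mirrors Bypass_acl() above: forced flag, asymmetric kernel,
  // or a kernel size the ACL path does not handle.
  bool Bypass_acl() const {
    if (force_bypass) return true;
    if (kernel_h != kernel_w) return true;
    if (kernel_h != 2 && kernel_h != 3) return true;
    return false;
  }

  // Stand-ins for PoolingLayer<Dtype>::Forward_cpu and the ACL kernel.
  void reference_forward(const std::vector<float>& in, std::vector<float>& out) { out = in; }
  void acl_run(const float* in, float* out, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) out[i] = in[i];
  }

  void Forward(const std::vector<float>& bottom, std::vector<float>& top,
               int num, std::size_t per_image) {
    if (Bypass_acl()) { reference_forward(bottom, top); return; }
    top.resize(bottom.size());
    const float* src = bottom.data();
    float* dst = top.data();
    for (int n = 0; n < num; ++n) {  // one acl_run per image, as in the patch
      acl_run(src, dst, per_image);
      src += per_image;
      dst += per_image;
    }
  }
};

int main() {
  ToyPoolingOp op;
  std::vector<float> bottom(8, 1.0f), top;
  op.Forward(bottom, top, /*num=*/2, /*per_image=*/4);
  std::printf("used %s path\n", op.Bypass_acl() ? "bypass" : "acl");
  return 0;
}
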
diff --git a/src/caffe/layers/acl_relu_layer.cpp b/src/caffe/layers/acl_relu_layer.cpp
index 03194539..2b712dda 100644
--- a/src/caffe/layers/acl_relu_layer.cpp
+++ b/src/caffe/layers/acl_relu_layer.cpp
@@ -13,9 +13,9 @@ void ACLReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
}
template <typename Dtype>
-void ACLReLULayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLReLULayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::RELU);
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
}
template <typename Dtype>
void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -25,20 +25,32 @@ void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLReLULayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ // Fallback to standard Caffe for leaky ReLU.
+ if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
+ bypass_acl=true;
+ }
+ if (isScheduleEnable()) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_RELU_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
ReLULayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- // Fallback to standard Caffe for leaky ReLU.
- if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
- ReLULayer<Dtype>::Forward_cpu(bottom, top);
- return;
- }
ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
}
@@ -48,12 +60,7 @@ void ACLReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_RELU_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
- ReLULayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- // Fallback to standard Caffe for leaky ReLU.
- if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
+ if (Bypass_acl(bottom,top)) {
ReLULayer<Dtype>::Forward_cpu(bottom, top);
return;
}
diff --git a/src/caffe/layers/acl_sigmoid_layer.cpp b/src/caffe/layers/acl_sigmoid_layer.cpp
index eac15651..4b3f660c 100644
--- a/src/caffe/layers/acl_sigmoid_layer.cpp
+++ b/src/caffe/layers/acl_sigmoid_layer.cpp
@@ -14,9 +14,9 @@ void ACLSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
-void ACLSigmoidLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::LOGISTIC);
+void ACLSigmoidLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
}
template <typename Dtype>
void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -26,12 +26,21 @@ void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLSigmoidLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLSigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_SIGMOID_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
SigmoidLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -44,7 +53,7 @@ void ACLSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_SIGMOID_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
SigmoidLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
diff --git a/src/caffe/layers/acl_softmax_layer.cpp b/src/caffe/layers/acl_softmax_layer.cpp
index 1568d3aa..6d3cd93e 100644
--- a/src/caffe/layers/acl_softmax_layer.cpp
+++ b/src/caffe/layers/acl_softmax_layer.cpp
@@ -13,42 +13,19 @@ void ACLSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
}
template <typename Dtype>
-void ACLSoftmaxLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLSoftmaxLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
unsigned int channels = bottom[0]->shape(this->softmax_axis_);
- TensorShape shape(channels*this->inner_num_);
- checkreshape(shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
+ arm_compute::TensorShape shape(channels*this->inner_num_);
+ if (is_operator_init_done(shape)) return;
+ set_operator_init_done();
// Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ new_tensor(input(),shape,InputdataPtr(this,bottom));
+ new_tensor(output(),shape,OutputdataPtr(this,top));
+ acl_configure(softmax,this,NULL);
- //this->force_bypass_acl_path_=false;
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,shape,(void*)bottom_data);
- new_tensor(this->gpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,shape,(void*)bottom_data);
- new_tensor(this->cpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output);
- }
}
template <typename Dtype>
void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -57,29 +34,36 @@ void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLSoftmaxLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_ || this->inner_num_>1) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLSoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_SOFTMAX_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_ || this->inner_num_>1) {
+ if (Bypass_acl(bottom,top)) {
SoftmaxLayer<Dtype>::Forward_cpu(bottom,top);
return ;
}
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
int channels = bottom[0]->shape(this->softmax_axis_);
for (int i = 0; i < this->outer_num_; ++i) {
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
top_data += channels;
bottom_data += channels;
}
@@ -91,17 +75,15 @@ void ACLSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_SOFTMAX_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->inner_num_>1) {
+ if (Bypass_acl(bottom,top)) {
SoftmaxLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
for (int i = 0; i < this->outer_num_; ++i) {
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
top_data += this->inner_num_;
bottom_data += this->inner_num_;
}
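
The softmax setup above replaces the old checkreshape()/init_layer_ bookkeeping with an is_operator_init_done(shape)/set_operator_init_done() guard, so tensors are created and acl_configure() runs only when the input shape actually changes between forwards. Below is a self-contained sketch of that guard, with invented names rather than the real acl_operator API.

#include <cstdio>
#include <vector>

class LazyOperator {
 public:
  // Returns true when the operator was already configured for this shape.
  bool is_operator_init_done(const std::vector<unsigned>& shape) const {
    return init_done_ && shape == configured_shape_;
  }
  void configure(const std::vector<unsigned>& shape) {
    if (is_operator_init_done(shape)) return;  // fast path on repeated forwards
    configured_shape_ = shape;                 // remember what was configured
    init_done_ = true;                         // set_operator_init_done()
    std::puts("configure: create tensors, call acl_configure(...)");
  }
 private:
  std::vector<unsigned> configured_shape_;
  bool init_done_ = false;
};

int main() {
  LazyOperator op;
  op.configure({10});  // first forward: configures
  op.configure({10});  // same shape: skipped
  op.configure({20});  // shape changed (e.g. after Reshape): reconfigures
  return 0;
}
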
diff --git a/src/caffe/layers/acl_tanh_layer.cpp b/src/caffe/layers/acl_tanh_layer.cpp
index a1bb632c..9c1066e1 100644
--- a/src/caffe/layers/acl_tanh_layer.cpp
+++ b/src/caffe/layers/acl_tanh_layer.cpp
@@ -14,9 +14,9 @@ void ACLTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
-void ACLTanHLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::TANH);
+void ACLTanHLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::TANH);
}
template <typename Dtype>
@@ -27,12 +27,20 @@ void ACLTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLTanHLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+template <typename Dtype>
void ACLTanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_TANH_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
TanHLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -45,7 +53,7 @@ void ACLTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_TANH_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
TanHLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
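
The ReLU, Sigmoid and TanH wrappers differ only in which arm_compute::ActivationLayerInfo::ActivationFunction value they pass down to the shared ACLBaseActivationLayer. Below is a toy illustration of that delegation; the enum and classes are illustrative, not the ACL types.

#include <cmath>
#include <cstdio>

enum class Act { RELU, LOGISTIC, TANH };

struct BaseActivationOp {
  void SetupOperator(Act type) { type_ = type; }
  float run(float x) const {
    switch (type_) {
      case Act::RELU:     return x > 0.f ? x : 0.f;
      case Act::LOGISTIC: return 1.f / (1.f + std::exp(-x));
      case Act::TANH:     return std::tanh(x);
    }
    return x;
  }
  Act type_ = Act::RELU;
};

// Each wrapper only picks its activation, as SetupACLOperator() does above.
struct ReluOp    : BaseActivationOp { void Setup() { SetupOperator(Act::RELU); } };
struct SigmoidOp : BaseActivationOp { void Setup() { SetupOperator(Act::LOGISTIC); } };
struct TanhOp    : BaseActivationOp { void Setup() { SetupOperator(Act::TANH); } };

int main() {
  SigmoidOp s; s.Setup();
  std::printf("sigmoid(0) = %.2f\n", s.run(0.f));  // prints 0.50
  return 0;
}
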
diff --git a/tools/extra/tpi.py b/tools/extra/tpi.py
new file mode 100644
index 00000000..f455ed9b
--- /dev/null
+++ b/tools/extra/tpi.py
@@ -0,0 +1,213 @@
+import sys
+import os
+import re
+import pdb
+import xlwt
+
+help_ = '''
+Usage:
+ python tpi.py log.txt
+'''
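+# Output: the timing tables are printed to stdout and also written to <logfile>.xls via xlwt.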
+
+#data_list= {}
+data_list1= []
+data_list2= []
+cnt=0
+times=0.0
+table_val=''
+name_list1= ['allocate','run','configure','tensor_copy','ACL_CONV','ACL_FC','ACL_LRN','ACL_POOLING','ACL_RELU','ACL_SOFTMAX']
+
+
+
+
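+# Return the first two non-empty tokens of a split log line as a (name, value) pair.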
+def getvalpairs(words):
+ name=''
+ val=''
+ for word in words:
+ if word=='':
+ continue
+ if name=='':
+ name=word
+ else:
+ val=word
+ break;
+ #print word,
+ #print ''
+ return (name,val)
+
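+# Accumulate val into the existing entry with this name, or append a new {'idx','val','name'} record.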
+def addpairstolist(db,name,val,idx):
+ #pdb.set_trace()
+ #if idx in db:
+ # db[idx]['val'] += val
+ #else:
+ # db[idx] = {'val':val,'name':name}
+
+ #pdb.set_trace()
+
+ for i in db:
+ if i['name']==name:
+ i['val'] += val
+ return
+ db.append({'idx':idx,'val':val,'name':name})
+
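+# Return the tab depth of a timing line: the 1-based index of the first non-empty tab-separated field after the colon.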
+def gettabnum(line):
+ start=line.find(':')
+ if start==-1:
+ start=0
+ else:
+ start+=1
+ #pdb.set_trace()
+    rest=line[start:-1].lstrip(' ')
+    words=re.split('\t',rest)
+ idx=0
+ for word in words:
+ idx+=1
+ if word=='':
+ continue
+ break
+ return idx
+
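+# Parse the profiling log: collect (name, time) entries into data_list1, then into data_list2 once a "used time" marker line is seen.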
+def decodefile(logfile):
+ data_list=data_list1
+ for line in open(logfile):
+ if line.find(':')==-1:
+ continue
+ #pdb.set_trace()
+ #print line,
+ idx=gettabnum(line)
+ words=re.split('\t|:| |\r|\n',line)
+ #print(words)
+ (name,val)=getvalpairs(words)
+ #print (name,float(val),eval(val))
+ if name == 'used' and val == 'time':
+ data_list=data_list2
+ try:
+ addpairstolist(data_list,name,float(val),idx)
+ except ValueError as e:
+ #print(line)
+ continue
+
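+# Sort entries by tab depth, sum the ACL layer times into TPI, print the averages (divided by 'times') and write them to the xls sheet.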
+def printresult(db):
+ #for i in db:
+ # print i, db[i]['idx'],db[i]['val']
+ #pdb.set_trace()
+ db.sort(key=lambda obj:obj.get('idx'), reverse=False)
+ tpi_start=0
+ conv_str='ACL_CONV'
+ find_acl = 0
+ name_index=0
+ global trow
+ global tcol
+ for i in db:
+ if i['name']==conv_str:
+ tpi_start=i['idx']
+
+ tpi=0
+ for i in db:
+ if i['idx']>=tpi_start:
+ tpi+=i['val']
+
+ start=len('ACL_')
+
+ table_head='TPI'+'\t'
+ table_val='%.4f' % (tpi/times)+'\t'
+
+ for i in db:
+ #print i
+ if i['idx']<tpi_start:
+ if i['name'].find('ACL_')==0:
+ table_head+=i['name'][start:]+'\t'
+ else:
+ table_head+=i['name']+'\t'
+ table_val+='%.4f' % (i['val']/times)+'\t'
+
+ print(table_head)
+ print(table_val)
+
+ table_head='TPI'+'\t'
+ table_val='%.4f' % (tpi/times)+'\t'
+
+ for i in db:
+ if i['idx']>=tpi_start:
+ if i['name'].find('ACL_')==0:
+ #pdb.set_trace()
+ table_head+=i['name'][start:]+'\t'
+ else:
+ table_head+=i['name']+'\t'
+ table_val+='%.4f' % (i['val']/times)+'\t'
+
+ print(table_head)
+ print(table_val)
+
+ ws.write(trow, tcol, 'TPI')
+ ws.write(trow+1,tcol,'%.4f' % (tpi/times))
+ tcol+=1
+
+ temp_row=trow
+ temp_col=tcol
+ for i in name_list1:
+ if i.find('ACL_')==0 and find_acl==0:
+ temp_row+=2
+ temp_col=2
+ find_acl=1
+ ws.write(temp_row,temp_col,i)
+ ws.write(temp_row+1,temp_col,'0')
+ temp_col+=1
+ find_acl=0
+
+ for i in db:
+ curname=i['name']
+ curvalue='%.4f' % (i['val']/times)
+ if curname == 'ACL_BN':
+ ws.write(trow+2,7,curname)
+ ws.write(trow+3,7,curvalue)
+
+ if curname in name_list1:
+ val_col=name_list1.index(curname)+2
+ val_row=trow
+ # print ('name found'+ curname + curvalue)
+ # print(val_col)
+ # print (val_row)
+ if val_col>5:
+ val_col-=4
+ val_row+=2
+ ws.write(val_row,val_col,curname)
+ ws.write(val_row+1,val_col,curvalue)
+
+ tcol=0
+ trow+=4
+
+
+if __name__ == '__main__' :
+ if len(sys.argv) < 2:
+ print(help_)
+ sys.exit()
+ else:
+ logfile = sys.argv[1]
+
+ filename = os.path.basename(logfile)
+ decodefile(logfile)
+
+ wb = xlwt.Workbook()
+ ws = wb.add_sheet('testsheet',True)
+ trow = 0
+ tcol = 0
+ cnt=1
+ times=1.0
+ table_val=''
+ print('1st time:')
+ ws.write(trow,tcol,'1st time')
+ tcol+=1
+ printresult(data_list1)
+
+ cnt=2
+ times=10.0
+ table_val=''
+ print('\nAverage of 2-11 times:')
+ ws.write(trow, tcol, '2-11 times')
+ tcol+=1
+ printresult(data_list2)
+ wb.save(filename+'.xls')
+ print ('Xls file generated')
+
+