author | huifang <huifangzhang@openailab> | 2018-01-31 19:00:24 +0800 |
---|---|---|
committer | huifang <huifangzhang@openailab> | 2018-01-31 19:00:24 +0800 |
commit | 1224a143fc631f9f004881dba7a32c6f7ed5e1a5 (patch) | |
tree | 2f41c58f0d40c9c76b4df1762c09597397629648 | |
parent | 7e51f0ff47b9a74f949f5b9ca448d56b6ba3eba5 (diff) | |
download | caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.tar.gz caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.tar.bz2 caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.zip |
v0.5.0
49 files changed, 2591 insertions, 1546 deletions
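Among the changes below: the Makefile gains a USE_OPENCL switch and builds release code with -O3 instead of -O2, Makefile.config.acl falls back to a local ComputeLibrary checkout when /usr/local/AID is absent, include/caffe/acl_layer.hpp is replaced by the ACLOperator-based include/caffe/acl_operator.hpp and acl_tensor.hpp, and a new example (classification_profiling_schedule.cpp) drives the net through the new AclEnableSchedule() hook. A minimal sketch of that hook in use — assuming a CaffeOnACL build with USE_ACL enabled, and with placeholder model/weight paths that are not part of this commit:

```cpp
#include <caffe/caffe.hpp>

int main() {
  caffe::Caffe::set_mode(caffe::Caffe::CPU);   // CPU_ONLY build; ACL handles NEON/OpenCL dispatch
  caffe::AclEnableSchedule();                  // opt into the ACL scheduler added in this commit

  // "deploy.prototxt" / "network.caffemodel" are placeholder paths, not files from this repo.
  caffe::Net<float> net("deploy.prototxt", caffe::TEST);
  net.CopyTrainedLayersFrom("network.caffemodel");

  net.Forward();                               // layers run through ACL operators where supported
  return 0;
}
```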
@@ -329,7 +329,7 @@ ifeq ($(DEBUG), 1)
 COMMON_FLAGS += -DDEBUG -g -O0
 NVCCFLAGS += -G
 else
- COMMON_FLAGS += -DNDEBUG -O2
+ COMMON_FLAGS += -DNDEBUG -O3
 endif
 # cuDNN acceleration configuration.
@@ -352,6 +352,10 @@ ifeq ($(USE_ACL), 1)
 COMMON_FLAGS += -DUSE_ACL -std=c++11
 endif
+ifeq ($(USE_OPENCL), 1)
+ COMMON_FLAGS += -DUSE_OPENCL
+endif
+
 #USE_PROFILING -- get profiling informations, is controled by LOGACL
 #LAYER_PERF_STAT -- haitao's net profiling information
 ifeq ($(USE_PROFILING), 1)
diff --git a/Makefile.config.acl b/Makefile.config.acl
index f20326f9..8bbce0a2 100644
--- a/Makefile.config.acl
+++ b/Makefile.config.acl
@@ -1,18 +1,19 @@
 ## Refer to http://caffe.berkeleyvision.org/installation.html
 # Contributions simplifying and improving our build system are welcome!
-AIDDIR=/usr/local/AID
-export PKG_CONFIG_PATH=${AIDDIR}/opencv3.3.0/lib/pkgconfig
-
 # cuDNN acceleration switch (uncomment to build with cuDNN).
 # USE_CUDNN := 1
+AIDDIR=/usr/local/AID
+export PKG_CONFIG_PATH=${AIDDIR}/opencv3.3.0/lib/pkgconfig
+
 # CPU-only switch (uncomment to build without GPU support).
 CPU_ONLY := 1
 USE_PROFILING := 0
 USE_ACL :=1
+USE_OPENCL:=1
 ACL_ROOT :=$(AIDDIR)/ComputeLibrary
 ACL_INCS :=$(ACL_ROOT)/include
 ACL_INCS +=$(ACL_ROOT)
@@ -143,3 +144,11 @@ TEST_GPUID := 0
 # enable pretty build (comment to see full commands)
 Q ?= @
+
+ifeq ($(wildcard $(AIDDIR)),)
+ACL_ROOT :=$(shell pwd)/../ComputeLibrary
+ACL_INCS :=$(ACL_ROOT)/include
+ACL_INCS +=$(ACL_ROOT)
+ACL_LIBS_DIR :=$(ACL_ROOT)/build
+OPENCV_VERSION :=
+endif
@@ -1,5 +1,4 @@
-
 # CaffeOnACL
 
 [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
@@ -11,9 +10,10 @@ The release version is 0.4.0, is based on [Rockchip RK3399](http://www.rock-chip
 * Caffe is a fast open framework for deep learning. See also [Caffe](https://github.com/BVLC/caffe).
 ### Documents
-* [Installation instructions](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/installation.md)
-* [User Manuals PDF](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/user_manual.pdf)
-* [Performance Report PDF](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/performance_report.pdf)
+* [Installation instructions](acl_openailab/installation.md)
+* [User Manuals PDF](acl_openailab/user_manual.pdf)
+* [Performance Report PDF](acl_openailab/performance_report.pdf)
+* [Accuracy Report PDF](acl_openailab/accuracy_report.pdf)
 
 ### Arm Compute Library Compatibility Issues :
 There are some compatibility issues between ACL and Caffe Layers, we bypass it to Caffe's original layer class as the workaround solution for the below issues
diff --git a/acl_openailab/accuracy_report.pdf b/acl_openailab/accuracy_report.pdf
Binary files differ
new file mode 100644
index 00000000..75d9c08f
--- /dev/null
+++ b/acl_openailab/accuracy_report.pdf
diff --git a/acl_openailab/installation.md b/acl_openailab/installation.md
index 75bb62bb..f97c1040 100644
--- a/acl_openailab/installation.md
+++ b/acl_openailab/installation.md
@@ -21,10 +21,11 @@ This Installation will help you get started to setup CaffeOnACL on RK3399 quickl
 wget --no-check-certificate https://github.com/opencv/opencv/archive/3.3.0.tar.gz
 tar -xvf 3.3.0.tar.gz
 #### Download "gen-pkg-config-pc"
- wget https://github.com/OAID/AID-tools/raw/master/script/gen-pkg-config-pc.sh
+ wget ftp://ftp.openailab.net/tools/script/gen-pkg-config-pc.sh
+ chmod +x ./gen-pkg-config-pc.sh
 #### Download "ACL"
 git clone https://github.com/ARM-software/ComputeLibrary.git
- git checkout bf8b01d
+ git checkout 48bc34e
 #### Download "CaffeOnACL" :
 git clone https://github.com/OAID/CaffeOnACL.git
 #### Download "Googletest" :
@@ -45,14 +46,16 @@ This Installation will help you get started to setup CaffeOnACL on RK3399 quickl
 mkdir build
 aarch64-linux-gnu-gcc opencl-1.2-stubs/opencl_stubs.c -Iinclude -shared -o build/libOpenCL.so
 scons Werror=1 -j4 debug=0 asserts=1 neon=1 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a
- wget https://github.com/OAID/AID-tools/raw/master/script/Computelibrary/Makefile
+ wget ftp://ftp.openailab.net/tools/script/Computelibrary/Makefile
 sudo make install
+ sudo ~/gen-pkg-config-pc.sh /usr/local/AID
 ## 3.3 Build Caffe :
 cd ~/CaffeOnACL
 make all
 make distribute
 sudo make install
+ sudo ~/gen-pkg-config-pc.sh /usr/local/AID
 ## 3.4 Build Unit tests
 ##### Build the gtest libraries
diff --git a/acl_openailab/performance_report.pdf b/acl_openailab/performance_report.pdf
Binary files differ
index 7bffc1d8..4cc1b531 100644
--- a/acl_openailab/performance_report.pdf
+++ b/acl_openailab/performance_report.pdf
diff --git a/acl_openailab/user_manual.pdf b/acl_openailab/user_manual.pdf
Binary files differ
index 25530f04..aadfe38e 100644
--- a/acl_openailab/user_manual.pdf
+++ b/acl_openailab/user_manual.pdf
diff --git a/data/ilsvrc12/get_ilsvrc_aux.sh b/data/ilsvrc12/get_ilsvrc_aux.sh
index dc0d0a72..835212bd 100644
--- a/data/ilsvrc12/get_ilsvrc_aux.sh
+++ b/data/ilsvrc12/get_ilsvrc_aux.sh
@@ -18,4 +18,4 @@ echo "Unzipping..."
 tar -xf caffe_ilsvrc12.tar.gz && rm -f caffe_ilsvrc12.tar.gz
-echo "Done."
+echo "Done. "
\ No newline at end of file diff --git a/examples/cpp_classification/classification_profiling_schedule.cpp b/examples/cpp_classification/classification_profiling_schedule.cpp new file mode 100644 index 00000000..91fff5f7 --- /dev/null +++ b/examples/cpp_classification/classification_profiling_schedule.cpp @@ -0,0 +1,547 @@ +#include <caffe/caffe.hpp> +#ifdef USE_OPENCV +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#endif // USE_OPENCV +#include <algorithm> +#include <iosfwd> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#ifdef USE_PROFILING + +#include <iostream> + +#include <time.h> + +#define REPEAT_TEST + +unsigned long get_cur_time(void) +{ + struct timespec tm; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &tm); + + return (tm.tv_sec*1000+tm.tv_nsec/1000000); +} + +#endif //USE_PROFILING + +#ifdef USE_OPENCV +using namespace caffe; // NOLINT(build/namespaces) +using std::string; + +/* Pair (label, confidence) representing a prediction. */ +typedef std::pair<string, float> Prediction; + +class Classifier { + public: + Classifier(const string& model_file, + const string& trained_file, + const string& mean_file, + const string& label_file); + + std::vector<Prediction> Classify(const cv::Mat& img, int N = 5); + +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT + void dump_perf_stat(void); + void dump_single_layer_io(int idx, Layer<float> * p_layer); + void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time); +#ifdef REPEAT_TEST + void collect_layer_stat(vector<vector<perf_stat> * > & all_stat); + void dump_all_stat(vector <vector<perf_stat>*>& all_stat); + void reset_layer_stat(); +#endif +#endif + +#endif //USE_PROFILING + + private: + void SetMean(const string& mean_file); + + std::vector<float> Predict(const cv::Mat& img); + + void WrapInputLayer(std::vector<cv::Mat>* input_channels); + + void Preprocess(const cv::Mat& img, + std::vector<cv::Mat>* input_channels); + + private: + shared_ptr<Net<float> > net_; + cv::Size input_geometry_; + int num_channels_; + cv::Mat mean_; + std::vector<string> labels_; +}; + +Classifier::Classifier(const string& model_file, + const string& trained_file, + const string& mean_file, + const string& label_file) { +#ifdef CPU_ONLY + Caffe::set_mode(Caffe::CPU); +#else + Caffe::set_mode(Caffe::GPU); +#endif + + AclEnableSchedule(); + /* Load the network. */ + net_.reset(new Net<float>(model_file, TEST)); + net_->CopyTrainedLayersFrom(trained_file); + + CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input."; + CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output."; + + Blob<float>* input_layer = net_->input_blobs()[0]; + num_channels_ = input_layer->channels(); + CHECK(num_channels_ == 3 || num_channels_ == 1) + << "Input layer should have 1 or 3 channels."; + input_geometry_ = cv::Size(input_layer->width(), input_layer->height()); + + /* Load the binaryproto mean file. */ + SetMean(mean_file); + + /* Load labels. 
*/ + std::ifstream labels(label_file.c_str()); + CHECK(labels) << "Unable to open labels file " << label_file; + string line; + while (std::getline(labels, line)) + labels_.push_back(string(line)); + + Blob<float>* output_layer = net_->output_blobs()[0]; + CHECK_EQ(labels_.size(), output_layer->channels()) + << "Number of labels is different from the output layer dimension."; +} + +static bool PairCompare(const std::pair<float, int>& lhs, + const std::pair<float, int>& rhs) { + return lhs.first > rhs.first; +} + +/* Return the indices of the top N values of vector v. */ +static std::vector<int> Argmax(const std::vector<float>& v, int N) { + std::vector<std::pair<float, int> > pairs; + for (size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector<int> result; + for (int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +/* Return the top N predictions. */ +std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) { + std::vector<float> output = Predict(img); + + N = std::min<int>(labels_.size(), N); + std::vector<int> maxN = Argmax(output, N); + std::vector<Prediction> predictions; + for (int i = 0; i < N; ++i) { + int idx = maxN[i]; + predictions.push_back(std::make_pair(labels_[idx], output[idx])); + } + + return predictions; +} + +/* Load the mean file in binaryproto format. */ +void Classifier::SetMean(const string& mean_file) { + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + + /* Convert from BlobProto to Blob<float> */ + Blob<float> mean_blob; + mean_blob.FromProto(blob_proto); + CHECK_EQ(mean_blob.channels(), num_channels_) + << "Number of channels of mean file doesn't match input layer."; + + /* The format of the mean file is planar 32-bit float BGR or grayscale. */ + std::vector<cv::Mat> channels; + float* data = mean_blob.mutable_cpu_data(); + for (int i = 0; i < num_channels_; ++i) { + /* Extract an individual channel. */ + cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data); + channels.push_back(channel); + data += mean_blob.height() * mean_blob.width(); + } + + /* Merge the separate channels into a single image. */ + cv::Mat mean; + cv::merge(channels, mean); + + /* Compute the global mean pixel value and create a mean image + * filled with this value. */ + cv::Scalar channel_mean = cv::mean(mean); + mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean); +} + +std::vector<float> Classifier::Predict(const cv::Mat& img) { + Blob<float>* input_layer = net_->input_blobs()[0]; + input_layer->Reshape(1, num_channels_, + input_geometry_.height, input_geometry_.width); + /* Forward dimension change to all layers. 
*/ + net_->Reshape(); + + std::vector<cv::Mat> input_channels; + WrapInputLayer(&input_channels); + + Preprocess(img, &input_channels); + +#ifdef USE_PROFILING + unsigned long tstart=get_cur_time(); +#endif //USE_PROFILING + + net_->Forward(); + +#ifdef USE_PROFILING + + unsigned long tend=get_cur_time(); + + std::cout<<"used time: "<<tend-tstart<<std::endl; + +#ifdef LAYER_PERF_STAT + dump_perf_stat(); +#ifdef REPEAT_TEST + + reset_layer_stat(); + + vector<vector<perf_stat>* > all_stat; + int rep_number=10; + + for(int i=0;i<rep_number;i++) + { + net_->Forward(); + collect_layer_stat(all_stat); + reset_layer_stat(); + } + + //dump stats + dump_all_stat(all_stat); + + for(int i=0;i<all_stat.size();i++) + delete all_stat[i]; + +#endif //REPEAT_TEST +#endif //LAYER_PERF_STAT +#endif //USE_PROFILING + + /* Copy the output layer to a std::vector */ + Blob<float>* output_layer = net_->output_blobs()[0]; + const float* begin = output_layer->cpu_data(); + const float* end = begin + output_layer->channels(); + return std::vector<float>(begin, end); +} + +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT + +#ifdef REPEAT_TEST +void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat) +{ + vector<perf_stat > * p_stat; + perf_stat * p_time_stat; + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + + + p_stat=new vector<perf_stat>; + + for (int i =0;i< layers.size(); i++) { + p_time_stat=layers[i]->get_time_stat(); + p_stat->push_back(*p_time_stat); + + } + + all_stat.push_back(p_stat); +} + +void Classifier::reset_layer_stat(void) +{ + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + perf_stat * p_time_stat; + + for (int i =0;i< layers.size(); i++) { + p_time_stat=layers[i]->get_time_stat(); + + p_time_stat->count=0; + p_time_stat->total=0; + p_time_stat->used=p_time_stat->start=p_time_stat->end=0; + } +} + +void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat) +{ + + struct new_perf_stat { + perf_stat stat; + int idx; + }; + + vector<new_perf_stat > layer_stat; + perf_stat * p_stat; + + uint64_t total_time=0; + + layer_stat.resize(all_stat[0]->size()); + + for(int i=0;i<all_stat.size();i++) + { + for(int j=0;j<layer_stat.size();j++) + { + p_stat=&layer_stat[j].stat; + + p_stat->total+=(*all_stat[i])[j].total; + p_stat->count+=(*all_stat[i])[j].count; + total_time+=(*all_stat[i])[j].total; + } + } + + total_time=total_time/all_stat.size(); + + std::cout<<std::endl<<"----------------------------------"<<std::endl; + std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl; + std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl; + std::cout<<"Each layer stats: ..."<<std::endl; + + + for(int i=layer_stat.size()-1;i>=0;i--) + { + p_stat=&layer_stat[i].stat; + + layer_stat[i].idx=i; + + std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size(); + std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100; + std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl; + } + + std::cout<<std::endl; + + std::cout<<"time cost top 10 layers are: ..."<<std::endl; + + std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b) + { + if(a.stat.total>b.stat.total) + return true; + else + return false; + }); + + uint64_t top_total_time=0; + + for(int i=0; i<10; i++) + { + p_stat=&layer_stat[i].stat; + + std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size(); + std::cout<<" ratio: 
"<<((float)p_stat->total)/all_stat.size()/total_time*100; + std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl; + top_total_time+=p_stat->total; + } + + std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl; + + std::cout<<std::endl; +} + +#endif + +void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer) +{ + const LayerParameter& layer_param=p_layer->layer_param(); + + std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name(); + std::cout<<" type: "<<layer_param.type()<<std::endl; + + const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom; + + for(int i=0;i<layer_param.bottom_size(); i++) + { + std::cout<<"bottom "<<layer_param.bottom(i)<<": "; + + Blob<float> * p_blob=(*p_bottom_vec)[i]; + + for(int j=0;j<p_blob->num_axes();j++) + { + std::cout<<p_blob->shape(j)<<" "; + } + std::cout<<std::endl; + } + + const vector<Blob<float>*> *p_top_vec=p_layer->saved_top; + for(int i=0;i<layer_param.top_size(); i++) + { + std::cout<<"top "<<layer_param.top(i)<<": "; + Blob<float> * p_blob=(*p_top_vec)[i]; + + for(int j=0;j<p_blob->num_axes();j++) + { + std::cout<<p_blob->shape(j)<<" "; + } + std::cout<<std::endl; + } +} + +void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time) +{ + const LayerParameter& layer_param=p_layer->layer_param(); + perf_stat * p_time_stat; + + p_time_stat=p_layer->get_time_stat(); + + std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name(); + std::cout<<" type: "<<layer_param.type(); + std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl; + + + std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count; + if(p_time_stat->count) + { + std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count; + } + + std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end; + std::cout<<std::endl; + + +} + +void Classifier::dump_perf_stat(void) +{ + uint64_t total_net_time=0; + + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + + std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl; + + for (int i = layers.size() - 1; i >= 0; --i) { + dump_single_layer_io(i,layers[i].get()); + } + + + for (int i = layers.size() - 1; i >= 0; --i) { + + perf_stat * p_time_stat; + + p_time_stat=layers[i]->get_time_stat(); + + total_net_time+=p_time_stat->total; + + } + + std::cout<<"Time for each layer ... sum of all layers is : "; + std::cout<<total_net_time<<std::endl; + + for (int i = layers.size() - 1; i >= 0; --i) { + + dump_single_layer_perf(i,layers[i].get(),total_net_time); + } + +} + +#endif + +#endif //USE_PROFILING + +/* Wrap the input layer of the network in separate cv::Mat objects + * (one per channel). This way we save one memcpy operation and we + * don't need to rely on cudaMemcpy2D. The last preprocessing + * operation will write the separate channels directly to the input + * layer. 
*/ +void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) { + Blob<float>* input_layer = net_->input_blobs()[0]; + + int width = input_layer->width(); + int height = input_layer->height(); + float* input_data = input_layer->mutable_cpu_data(); + for (int i = 0; i < input_layer->channels(); ++i) { + cv::Mat channel(height, width, CV_32FC1, input_data); + input_channels->push_back(channel); + input_data += width * height; + } +} + +void Classifier::Preprocess(const cv::Mat& img, + std::vector<cv::Mat>* input_channels) { + /* Convert the input image to the input image format of the network. */ + cv::Mat sample; + if (img.channels() == 3 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY); + else if (img.channels() == 4 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY); + else if (img.channels() == 4 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR); + else if (img.channels() == 1 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR); + else + sample = img; + + cv::Mat sample_resized; + if (sample.size() != input_geometry_) + cv::resize(sample, sample_resized, input_geometry_); + else + sample_resized = sample; + + cv::Mat sample_float; + if (num_channels_ == 3) + sample_resized.convertTo(sample_float, CV_32FC3); + else + sample_resized.convertTo(sample_float, CV_32FC1); + + cv::Mat sample_normalized; + cv::subtract(sample_float, mean_, sample_normalized); + + /* This operation will write the separate BGR planes directly to the + * input layer of the network because it is wrapped by the cv::Mat + * objects in input_channels. */ + cv::split(sample_normalized, *input_channels); + + CHECK(reinterpret_cast<float*>(input_channels->at(0).data) + == net_->input_blobs()[0]->cpu_data()) + << "Input channels are not wrapping the input layer of the network."; +} + +int main(int argc, char** argv) { + if (argc != 6) { + std::cerr << "Usage: " << argv[0] + << " deploy.prototxt network.caffemodel" + << " mean.binaryproto labels.txt img.jpg" << std::endl; + return 1; + } + + ::google::InitGoogleLogging(argv[0]); + + string model_file = argv[1]; + string trained_file = argv[2]; + string mean_file = argv[3]; + string label_file = argv[4]; + Classifier classifier(model_file, trained_file, mean_file, label_file); + + string file = argv[5]; + + std::cout << "---------- Prediction for " + << file << " ----------" << std::endl; + + cv::Mat img = cv::imread(file, -1); + CHECK(!img.empty()) << "Unable to decode image " << file; + std::vector<Prediction> predictions = classifier.Classify(img); + + /* Print the top N predictions. 
*/ + for (size_t i = 0; i < predictions.size(); ++i) { + Prediction p = predictions[i]; + std::cout << std::fixed << std::setprecision(4) << p.second << " - \"" + << p.first << "\"" << std::endl; + } +} +#else +int main(int argc, char** argv) { + LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV."; +} +#endif // USE_OPENCV diff --git a/include/caffe/acl_layer.hpp b/include/caffe/acl_layer.hpp deleted file mode 100644 index b188bb8c..00000000 --- a/include/caffe/acl_layer.hpp +++ /dev/null @@ -1,278 +0,0 @@ -#ifndef CAFFE_ACL_LAYER_HPP_ -#define CAFFE_ACL_LAYER_HPP_ - -#ifdef USE_ACL -#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" -#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" -#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" -#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" -#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" -#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" -#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" -#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" -#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" -#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" -#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; -#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001 -#define FLAGS_ENABLE_ACL_BNLL 0x00000002 -#define FLAGS_ENABLE_ACL_CONV 0x00000004 -#define FLAGS_ENABLE_ACL_FC 0x00000008 -#define FLAGS_ENABLE_ACL_LRN 0x00000010 -#define FLAGS_ENABLE_ACL_POOLING 0x00000020 -#define FLAGS_ENABLE_ACL_RELU 0x00000040 -#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080 -#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100 -#define FLAGS_ENABLE_ACL_TANH 0x00000200 -#define FLAGS_ENABLE_ACL_LC 0x00000400 -#define FLAGS_ENABLE_ACL_BN 0x00000800 -#define FLAGS_ENABLE_ACL_CONCAT 0x00001000 -extern unsigned int bypass_acl_class_layer; -#endif -#ifdef USE_PROFILING -#include "layer.hpp" - -#define MASK_LOG_APP_TIME 0x00000001 -#define MASK_LOG_ALLOCATE 0x00000002 -#define MASK_LOG_RUN 0x00000004 -#define MASK_LOG_CONFIG 0x00000008 -#define MASK_LOG_COPY 0x00000010 -#define MASK_LOG_ABSVAL 0x00000020 -#define MASK_LOG_BNLL 0x00000040 -#define MASK_LOG_CONV 0x00000080 -#define MASK_LOG_FC 0x00000100 -#define MASK_LOG_LRN 0x00000200 -#define MASK_LOG_POOLING 0x00000400 -#define MASK_LOG_RELU 0x00000800 -#define MASK_LOG_SIGMOID 0x00001000 -#define MASK_LOG_SOFTMAX 0x00002000 -#define MASK_LOG_TANH 0x00004000 -#define MASK_LOG_LC 0x00008000 -#define MASK_LOG_BN 0x00010000 -#define MASK_LOG_CONCAT 0x00020000 -#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t" -#define 
ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t" -#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t" -#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t" -#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t" -#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t" -#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t" -#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t" -#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t" -#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t" -#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_BN_INFO MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" -#define ACL_CONCAT_INFO MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" -extern unsigned int acl_log_flags; -#endif //USE_PROFILING -namespace caffe { -#ifdef USE_ACL -enum TensorType{ - tensor_input, - tensor_output, - tensor_weights, - tensor_biases, - tensor_data, -}; -template <typename ACLTensor> -class BaseTensor:public ACLTensor{ -public: - BaseTensor(bool share) - :share_(share),type_(tensor_input),allocate_(false){ - } - virtual void bindmem(void *mem,bool share){ - mem_=mem; - share_=share; - } - virtual void settensortype(TensorType type){ - type_=type; - }; - virtual void map(bool blocking = true){} - virtual void unmap(){} - virtual void commit(TensorType type=tensor_data); - int tensor_copy(void * mem, bool toTensor=true); -protected: - void* mem_; - bool share_; - TensorType type_; - bool allocate_; -}; -class GPUTensor:public BaseTensor<CLTensor>{ -public: - explicit GPUTensor(bool share) - :BaseTensor(share){} - virtual void map(bool blocking = true){ - if (!allocate_){ - CLTensor::allocator()->allocate(); - allocate_=true; - } - CLTensor::map(blocking); - } - virtual void unmap(){ - CLTensor::unmap(); - } -}; -class CPUTensor:public BaseTensor<Tensor>{ -public: - explicit CPUTensor(bool share) - :BaseTensor(share){} - virtual void map(bool blocking = true){ - if (!allocate_){ - Tensor::allocator()->allocate(); - allocate_=true; - } - } - virtual void unmap(){ - } -}; -template <typename ACLLayer,typename ACLTensor> -class ACLXPUBaseLayer{ -public: - virtual void commit(){ - if (input) { - input->commit(tensor_input); - } - if (output){ - output->commit(tensor_output); - } - if (weights){ - weights->commit(tensor_weights); - } - if (biases){ - biases->commit(tensor_biases); - } - } - virtual void run(bool gpu){ - commit(); -#ifdef USE_PROFILING - logtime_util log_time(ACL_RUN_INFO); -#endif //USE_PROFILING - layer->run(); - if (gpu) { - // Make sure all the OpenCL jobs are done executing: - CLScheduler::get().sync(); - } - } - virtual bool reshape(TensorShape &shape,TensorType type); - explicit ACLXPUBaseLayer(){ - layer=nullptr; - input=nullptr; - output=nullptr; - weights=nullptr; - biases=nullptr; - mean=nullptr; - var=nullptr; - beta=nullptr; - gamma=nullptr; -#ifdef USE_CONV_CACHE - for(int i = 0; i < 16; ++i){ - cache.layer[i] = nullptr; - cache.input[i] = nullptr; - cache.output[i] = nullptr; - cache.weights[i] = nullptr; - cache.biases[i] 
= nullptr; - } -#endif //USE_CONV_CACHE - } - virtual void freelayer(){ -#ifndef USE_CONV_CACHE - if (layer) delete layer; - if (input) delete input; - if (output) delete output; - if (weights) delete weights; - if (biases) delete biases; - if (mean) delete mean; - if (var) delete var; - if (beta) delete beta; - if (gamma) delete gamma; -#endif //USE_CONV_CACHE - layer=nullptr; - input=nullptr; - output=nullptr; - weights=nullptr; - biases=nullptr; - mean=nullptr; - var=nullptr; - beta=nullptr; - gamma=nullptr; - } - virtual ~ACLXPUBaseLayer(){ - freelayer(); - } - ACLLayer *layer; - ACLTensor *input; - ACLTensor *output; - ACLTensor *weights; - ACLTensor *biases; - //for BN - ACLTensor *mean; - ACLTensor *var; - ACLTensor *beta; - ACLTensor *gamma; -#ifdef USE_CONV_CACHE - struct{ - ACLLayer *layer[16]; - ACLTensor *input[16]; - ACLTensor *output[16]; - ACLTensor *weights[16]; - ACLTensor *biases[16]; - }cache; -#endif //USE_CONV_CACHE -}; -template <typename GPULayer, typename CPULayer> -class ACLBaseLayer { -public: - explicit ACLBaseLayer(); - virtual void gpu_run(); - virtual void cpu_run(); - virtual ~ACLBaseLayer(); - virtual GPULayer * new_gpulayer(); - virtual CPULayer * new_cpulayer(); - ACLXPUBaseLayer<GPULayer,GPUTensor>& gpu(){ - return gpu_; - } - ACLXPUBaseLayer<CPULayer,CPUTensor>& cpu(){ - return cpu_; - } - bool checkreshape(TensorShape shape,bool gpu=false, TensorType type=tensor_input); - template <typename ACLTensor> bool tensor_mem(ACLTensor *tensor,void *mem,bool share=false); - template <typename ACLTensor> bool tensor_mem(void *mem,ACLTensor *tensor,bool share=false); - template <typename ACLTensor> bool new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem=nullptr,bool share=false); -protected: - ACLXPUBaseLayer<GPULayer,GPUTensor> gpu_; - ACLXPUBaseLayer<CPULayer,CPUTensor> cpu_; - bool init_layer_; - bool force_bypass_acl_path_; - -}; - -#endif -} -#define INSTANTIATE_ACLBASECLASS(GPULayer,CPULayer) \ - template class ACLBaseLayer<GPULayer,CPULayer>; - -#define INSTANTIATE_ACLBASE_FUNCTION(GPULayer,CPULayer,ACLTensor) \ - template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share); \ - template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share); \ - template bool ACLBaseLayer<GPULayer,CPULayer>::new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem,bool share); \ - - -#endif diff --git a/include/caffe/acl_operator.hpp b/include/caffe/acl_operator.hpp new file mode 100644 index 00000000..90051054 --- /dev/null +++ b/include/caffe/acl_operator.hpp @@ -0,0 +1,718 @@ +#ifndef CAFFE_ACL_LAYER_HPP_ +#define CAFFE_ACL_LAYER_HPP_ + +#ifdef USE_ACL +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" 
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "acl_tensor.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001 +#define FLAGS_ENABLE_ACL_BNLL 0x00000002 +#define FLAGS_ENABLE_ACL_CONV 0x00000004 +#define FLAGS_ENABLE_ACL_FC 0x00000008 +#define FLAGS_ENABLE_ACL_LRN 0x00000010 +#define FLAGS_ENABLE_ACL_POOLING 0x00000020 +#define FLAGS_ENABLE_ACL_RELU 0x00000040 +#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080 +#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100 +#define FLAGS_ENABLE_ACL_TANH 0x00000200 +#define FLAGS_ENABLE_ACL_LC 0x00000400 +#define FLAGS_ENABLE_ACL_BN 0x00000800 +#define FLAGS_ENABLE_ACL_CONCAT 0x00001000 +extern unsigned int bypass_acl_class_layer; +extern unsigned int openailab_intfp; +#endif +#ifdef USE_PROFILING +#include "layer.hpp" + +#define MASK_LOG_APP_TIME 0x00000001 +#define MASK_LOG_ALLOCATE 0x00000002 +#define MASK_LOG_RUN 0x00000004 +#define MASK_LOG_CONFIG 0x00000008 +#define MASK_LOG_COPY 0x00000010 +#define MASK_LOG_ABSVAL 0x00000020 +#define MASK_LOG_BNLL 0x00000040 +#define MASK_LOG_CONV 0x00000080 +#define MASK_LOG_FC 0x00000100 +#define MASK_LOG_LRN 0x00000200 +#define MASK_LOG_POOLING 0x00000400 +#define MASK_LOG_RELU 0x00000800 +#define MASK_LOG_SIGMOID 0x00001000 +#define MASK_LOG_SOFTMAX 0x00002000 +#define MASK_LOG_TANH 0x00004000 +#define MASK_LOG_LC 0x00008000 +#define MASK_LOG_BN 0x00010000 +#define MASK_LOG_CONCAT 0x00020000 +#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t" +#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t" +#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t" +#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t" +#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t" +#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t" +#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t" +#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t" +#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t" +#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t" +#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_BN_INFO MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_CONCAT_INFO MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +extern unsigned int acl_log_flags; +#endif //USE_PROFILING +namespace caffe { +#ifdef USE_ACL +enum TensorType{ + tensor_input, + tensor_output, + tensor_weights, + 
tensor_biases, + tensor_mean, + tensor_var, + tensor_beta, + tensor_gamma, + tensor_concat, + tensor_data, +}; +enum OperatorState{ + operator_not_init, + operator_init_done, + operator_reinit, +}; +enum OperateType{ + operate_type_pooling, + operate_type_activation, + operate_type_lrn, + operate_type_conv, + operate_type_lc, + operate_type_fc, + operate_type_bn, + operate_type_softmax, + operate_type_concat, +}; +class BaseACLTensor{ +public: + BaseACLTensor() + :type_(tensor_input),allocate_(false){ + } + virtual void bindmem(void *mem){ + mem_=mem; + } + virtual void settensortype(TensorType type){ + type_=type; + }; + virtual void map(bool blocking = true){ + } + virtual void unmap(){} + virtual void commit(TensorType type=tensor_data){} + int tensor_copy(arm_compute::ITensor* tensor,void * mem, bool toTensor=true); +protected: + void* mem_; + TensorType type_; + bool allocate_; +}; +class ACLTensor:public BaseACLTensor,public Tensor{ +public: + ACLTensor(arm_compute::TensorInfo &&info) + :Tensor(info){ + } + virtual void map(bool blocking = true){ + if (!allocate_){ + Tensor::allocate(); + allocate_=true; + } + Tensor::map(blocking); + } + virtual int tensor_copy(void * mem, bool toTensor=true){ + auto acl_tensor=this; + arm_compute::ITensor* tensor=acl_tensor->tensor(); + BaseACLTensor::tensor_copy(tensor,mem,toTensor); + return 0; + } + virtual void unmap(){Tensor::unmap();} + virtual void commit(TensorType type=tensor_data); +}; +class ACLSubTensor:public BaseACLTensor,public SubTensor{ +public: + ACLSubTensor(std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord) + :SubTensor(parent.get(),shape,coord){ + } + virtual int tensor_copy(void * mem, bool toTensor=true){ + return 0; + } +}; + +template <typename T> +class TensorPair{ +public: + TensorPair(){} + ~TensorPair(){} + TensorType type; + std::unique_ptr<T> tensor; +}; +template <typename T> +std::unique_ptr<T> &tensor_item(std::vector<std::unique_ptr<TensorPair<T>>>& pool,TensorType type,int idx){ + int count=0; + for (auto &item: pool) { + if(item.get()->type==type){ + ++count; + } + if(item.get()->type==type && idx==count-1){ + return item.get()->tensor; + } + } + pool.push_back((std::unique_ptr<TensorPair<T>>)std::move(new TensorPair<T>)); + auto item=pool[pool.size()-1].get(); + item->type=type; + item->tensor=NULL; + return item->tensor; +} +class ACLOperator { +public: + virtual void commit(){ + for (auto & item: tensor_pool_) { + if(item.get()->tensor)item.get()->tensor->commit(item.get()->type); + } + } + inline void run(){ + commit(); + #ifdef USE_PROFILING + logtime_util log_time(ACL_RUN_INFO); + #endif //USE_PROFILING + for(auto &c : funcs_) + { + c->run(); + } + } + + inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs(){return funcs_;} + + inline std::unique_ptr<ACLSubTensor> &sinput(int idx=0){return tensor_item(subtensor_pool_,tensor_input,idx);} + inline std::unique_ptr<ACLSubTensor> &soutput(int idx=0){return tensor_item(subtensor_pool_,tensor_output,idx);} + inline std::unique_ptr<ACLSubTensor> &sweights(int idx=0){return tensor_item(subtensor_pool_,tensor_weights,idx);} + inline std::unique_ptr<ACLSubTensor> &sbiases(int idx=0){return tensor_item(subtensor_pool_,tensor_biases,idx);} + + inline std::unique_ptr<ACLTensor> &cinput(int idx=0){return tensor_item(tensor_pool_,tensor_concat,idx);} + inline std::unique_ptr<ACLTensor> &input(int idx=0){return tensor_item(tensor_pool_,tensor_input,idx);} + inline std::unique_ptr<ACLTensor> &output(int 
idx=0){return tensor_item(tensor_pool_,tensor_output,idx);} + inline std::unique_ptr<ACLTensor> &weights(int idx=0){return tensor_item(tensor_pool_,tensor_weights,idx);} + inline std::unique_ptr<ACLTensor> &biases(int idx=0){return tensor_item(tensor_pool_,tensor_biases,idx);} + inline std::unique_ptr<ACLTensor> &mean(int idx=0){return tensor_item(tensor_pool_,tensor_mean,idx);} + inline std::unique_ptr<ACLTensor> &var(int idx=0){return tensor_item(tensor_pool_,tensor_var,idx);} + inline std::unique_ptr<ACLTensor> &beta(int idx=0){return tensor_item(tensor_pool_,tensor_beta,idx);} + inline std::unique_ptr<ACLTensor> &gamma(int idx=0){return tensor_item(tensor_pool_,tensor_gamma,idx);} + inline std::unique_ptr<ACLTensor> &tensor(TensorType type){ + switch (type) { + case tensor_biases: + return biases(); + break; + case tensor_weights: + return weights(); + break; + case tensor_output: + return output(); + break; + default: + case tensor_input: + return input(); + break; + } + return input(); + } + + + explicit ACLOperator(const LayerParameter& param); + virtual ~ACLOperator(); + inline TargetHint getTargetHint(){ +#ifdef USE_OPENCL + if (target_hint_==TargetHint::DONT_CARE) { + if (Caffe::arm_gpu_mode()) { + return TargetHint::OPENCL; + } + return TargetHint::NEON; + } + return target_hint_; +#else + return TargetHint::NEON; +#endif + } + inline void setTargetHint(TargetHint hint){ + target_hint_=hint; + } + inline ConvolutionMethodHint & getConvMethod(){ return convolution_method_hint_;} + inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor,void *mem){ + tensor->bindmem(mem); + return true; + } + inline bool tensor_mem(void *mem,std::unique_ptr<ACLTensor> &tensor){ + tensor->tensor_copy(mem,false); + return true; + } + bool new_tensor(std::unique_ptr<ACLTensor> &tensor,arm_compute::TensorShape &shape,void *mem=nullptr,bool commit=false); + bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord); + inline int & group(){return _group;} + inline void set_operator_property(OperateType type,const char*name){ + name_=name; + type_=type; + } + inline void acl_run(void *input_data, void *output_data){ + if(input_data)tensor_mem(input(),input_data); + run(); + tensor_mem(output_data,output()); + } + + +protected: + inline bool isGPUMode(){ + if (!support_opencl_) return false; + return getTargetHint()==TargetHint::OPENCL; + } + inline OperatorState & opstate(){return operator_state_;} + inline bool is_operator_init_done(arm_compute::TensorShape shape,TensorType type=tensor_input){ + checkreshape(shape,type); + return operator_state_==operator_init_done; + } + inline void set_operator_init_done(){ + opstate()=operator_init_done; + set_bypass_state(false); + } + inline void set_bypass_state(bool state=false){ + force_bypass_acl_path_=state; + } + inline OperatorState checkreshape(arm_compute::TensorShape shape,TensorType type=tensor_input){ + opstate()=reshape(shape,type); + if (opstate()==operator_reinit) { + freeres(); + } + return opstate(); + } + inline OperatorState reshape(arm_compute::TensorShape &shape,TensorType type){ + arm_compute::TensorShape _shape; + std::unique_ptr<ACLTensor> &acl_tensor=tensor(type); + if (!acl_tensor.get()) return operator_not_init; + _shape = acl_tensor->info().tensor_shape(); + if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { + return operator_init_done; + } + return operator_reinit; + } + inline void freeres(){ + 
tensor_pool_.clear(); + subtensor_pool_.clear(); + funcs_.clear(); + } + inline const char* &name(){return name_;} + +protected: + std::vector<std::unique_ptr<TensorPair<ACLTensor>>>tensor_pool_; + std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>>subtensor_pool_; + std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_; + OperatorState operator_state_; + bool force_bypass_acl_path_; + TargetHint target_hint_; + ConvolutionMethodHint convolution_method_hint_; + static bool support_opencl_; + static bool init_cl_env; + int _group; + const char* name_; + OperateType type_; +}; + +int isScheduleEnable(); + +template <typename OperatorType, typename TensorType> +std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output){ + auto op = cpp14::make_unique<OperatorType>(); + op->configure( + dynamic_cast<TensorType *>(input), + dynamic_cast<TensorType *>(output) + ); + + return std::move(op); +} + +template <typename OperatorType, typename TensorType> +std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *output) +{ + return instantiate_function<OperatorType, TensorType>(input, output); +} + +template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor> +std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input, std::unique_ptr<ACLTensor>& output,TargetHint& hint){ + std::unique_ptr<arm_compute::IFunction> func; +#ifdef USE_OPENCL + if(hint == TargetHint::OPENCL) + { + func = instantiate<GPUOpType, GPUTensor>(input->tensor(), output->tensor()); + } + else +#endif + { + func = instantiate<CPUOpType, CPUTensor>(input->tensor(), output->tensor()); + } + return func; +} + + +template <typename OperatorType, typename TensorType,typename VectorTensor> +std::unique_ptr<arm_compute::IFunction> instantiate_function(VectorTensor inputs, arm_compute::ITensor *output){ + auto op = cpp14::make_unique<OperatorType>(); + op->configure( + inputs, + dynamic_cast<TensorType *>(output) + ); + + return std::move(op); +} + +template <typename OperatorType, typename TensorType,typename VectorTensor> +std::unique_ptr<arm_compute::IFunction> instantiate(VectorTensor inputs, arm_compute::ITensor *output) +{ + return instantiate_function<OperatorType, TensorType,VectorTensor>(inputs, output); +} + +template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor> +std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(ACLOperator*& acl_op, std::unique_ptr<ACLTensor>& output,int num,TargetHint& hint){ + std::unique_ptr<arm_compute::IFunction> func; +#ifdef USE_OPENCL + if(hint == TargetHint::OPENCL) + { + static std::vector<arm_compute::ICLTensor*> tensors; + tensors.clear(); + for (int i=0;i<num;++i) { + tensors.push_back(dynamic_cast<arm_compute::ICLTensor*>(acl_op->cinput(i).get()->tensor())); + } + func = instantiate<GPUOpType, GPUTensor, std::vector<arm_compute::ICLTensor *>>(tensors, output->tensor()); + } + else +#endif + { + static std::vector<arm_compute::ITensor*> tensors; + tensors.clear(); + for (int i=0;i<num;++i) { + tensors.push_back(dynamic_cast<arm_compute::ITensor*>(acl_op->cinput(i).get()->tensor())); + } + func = instantiate<CPUOpType, CPUTensor,std::vector<arm_compute::ITensor*>>(tensors, output->tensor()); + } + return func; +} + +template <typename OperatorType, typename TensorType,typename OperatorInfo> +std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, 
arm_compute::ITensor *output, const OperatorInfo &info){ + auto op = cpp14::make_unique<OperatorType>(); + op->configure( + dynamic_cast<TensorType *>(input), + dynamic_cast<TensorType *>(output), + info); + + return std::move(op); +} + +template <typename OperatorType, typename TensorType,typename OperatorInfo> +std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *output, const OperatorInfo &info) +{ + return instantiate_function<OperatorType, TensorType, OperatorInfo>(input, output, info); +} + +template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor, typename OperatorInfo> +std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input, std::unique_ptr<ACLTensor>& output, const OperatorInfo &info,TargetHint& hint){ + std::unique_ptr<arm_compute::IFunction> func; +#ifdef USE_OPENCL + if(hint == TargetHint::OPENCL) + { + func = instantiate<GPUOpType, GPUTensor,OperatorInfo>(input->tensor(), output->tensor(), info); + } + else +#endif + { + func = instantiate<CPUOpType, CPUTensor,OperatorInfo>(input->tensor(), output->tensor(), info); + } + return func; +} + + +template <typename OperatorType, typename TensorType,typename OperatorInfo> +std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input,arm_compute::ITensor *weights,arm_compute::ITensor *biases, arm_compute::ITensor *output, const OperatorInfo &info){ + auto op = cpp14::make_unique<OperatorType>(); + op->configure( + dynamic_cast<TensorType *>(input), + dynamic_cast<TensorType *>(weights), + dynamic_cast<TensorType *>(biases), + dynamic_cast<TensorType *>(output), + info); + return std::move(op); +} + +template <typename OperatorType, typename TensorType,typename OperatorInfo> +std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input,arm_compute::ITensor *weights,arm_compute::ITensor *biases, arm_compute::ITensor *output, const OperatorInfo &info) +{ + return instantiate_function<OperatorType, TensorType, OperatorInfo>(input,weights,biases,output, info); +} + +template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor, typename OperatorInfo,typename ACLTensor> +std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input,std::unique_ptr<ACLTensor>& weights,std::unique_ptr<ACLTensor>& biases, std::unique_ptr<ACLTensor>& output, const OperatorInfo &info,TargetHint& hint){ + std::unique_ptr<arm_compute::IFunction> func; + arm_compute::ITensor * biases_tensor=NULL; + + if (biases.get()) { + biases_tensor=biases->tensor(); + } +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) + { + func = instantiate<GPUOpType, GPUTensor,OperatorInfo>(input->tensor(), weights->tensor(),biases_tensor,output->tensor(), info); + } + else +#endif + { + func = instantiate<CPUOpType, CPUTensor,OperatorInfo>(input->tensor(), weights->tensor(),biases_tensor, output->tensor(), info); + } + return func; +} + + + +template <typename Dtype,typename OperatorType, typename TensorType> +std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean,arm_compute::ITensor *var,arm_compute::ITensor *beta,arm_compute::ITensor *gamma,Dtype & eps){ + auto op = cpp14::make_unique<OperatorType>(); + op->configure( + dynamic_cast<TensorType *>(input), + dynamic_cast<TensorType *>(output), + dynamic_cast<TensorType *>(mean), + dynamic_cast<TensorType *>(var), + 
dynamic_cast<TensorType *>(beta), + dynamic_cast<TensorType *>(gamma), + eps); + + return std::move(op); +} + +template <typename Dtype,typename OperatorType, typename TensorType> +std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor * input,arm_compute::ITensor * output, + arm_compute::ITensor * mean,arm_compute::ITensor * var,arm_compute::ITensor * beta,arm_compute::ITensor * gamma,Dtype eps){ + return instantiate_function<Dtype,OperatorType, TensorType>(input,output, mean,var,beta,gamma,eps); +} + +template <typename Dtype,typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor> +std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input,std::unique_ptr<ACLTensor>& output, + std::unique_ptr<ACLTensor>& mean,std::unique_ptr<ACLTensor>& var,std::unique_ptr<ACLTensor>& beta,std::unique_ptr<ACLTensor>& gamma,Dtype eps,TargetHint hint){ + std::unique_ptr<arm_compute::IFunction> func; +#ifdef USE_OPENCL + if(hint == TargetHint::OPENCL) + { + func = instantiate<Dtype,GPUOpType, GPUTensor>(input->tensor(),output->tensor(), mean->tensor(),var->tensor(),beta->tensor(),gamma->tensor(),eps); + } + else +#endif + { + func = instantiate<Dtype,CPUOpType, CPUTensor>(input->tensor(),output->tensor(), mean->tensor(),var->tensor(),beta->tensor(),gamma->tensor(),eps); + } + return func; +} + + +template <typename OperatorInfo> +bool instantiate_op_pooling(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input, std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){ + func.push_back(instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor, arm_compute::NEPoolingLayer, arm_compute::ITensor, arm_compute::PoolingLayerInfo>(input, output, info, hint)); + return true; +} +template <typename OperatorInfo> +bool instantiate_op_activation(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){ + func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,arm_compute::ICLTensor,arm_compute::NEActivationLayer,arm_compute::ITensor, arm_compute::ActivationLayerInfo>(input, output, info, hint)); + return true; +} +template <typename OperatorInfo> +bool instantiate_op_lrn(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){ + func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,arm_compute::ICLTensor,arm_compute::NENormalizationLayer,arm_compute::ITensor, arm_compute::NormalizationLayerInfo>(input, output, info, hint)); + return true; +} +template <typename OperatorInfo> +bool instantiate_op_conv(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,const OperatorInfo &info){ + std::unique_ptr<ACLTensor> & weights=acl_op->weights(); + std::unique_ptr<ACLTensor> & biases=acl_op->biases(); + ConvolutionMethodHint& conv_method=acl_op->getConvMethod(); + bool has_biases=biases.get()?true:false; + int& groups=acl_op->group(); + arm_compute::TensorShape input_shape=input->info().tensor_shape(); + arm_compute::TensorShape weights_shape=weights->info().tensor_shape(); + arm_compute::TensorShape biases_shape; + if (has_biases) { + biases_shape = 
biases->info().tensor_shape(); + } + arm_compute::TensorShape output_shape=output->info().tensor_shape(); + + if (groups==1) { + if (conv_method == ConvolutionMethodHint::GEMM) { + func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo>(acl_op->input(), acl_op->weights(), acl_op->biases(), acl_op->output(), info, hint)); + }else{ + func.push_back(instantiate_op_func<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo>(acl_op->input(), acl_op->weights(), acl_op->biases(), acl_op->output(), info, hint)); + } + return true; + } + + // Calculate sub-tensor splits + const int input_split = input_shape.z() / groups; + const int output_split = output_shape.z() / groups; + const int weights_split = weights_shape[3] / groups; + const int biases_split = biases_shape.x() / groups; + + // Calculate sub-tensor shapes + input_shape.set(2, input_split); + output_shape.set(2, output_split); + weights_shape.set(3, weights_split); + biases_shape.set(0, biases_split); + + for (auto i = 0; i < groups; ++i) { + // Calculate sub-tensors starting coordinates + arm_compute::Coordinates input_coord(0, 0, input_split * i); + arm_compute::Coordinates output_coord(0, 0, output_split * i); + arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i); + arm_compute::Coordinates biases_coord(biases_split * i); + + // Create sub-tensors for input, output, weights and bias + acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape, input_coord); + acl_op->new_tensor(acl_op->soutput(i),acl_op->output(),output_shape, output_coord); + acl_op->new_tensor(acl_op->sweights(i),acl_op->weights(), weights_shape, weights_coord); + if (has_biases) { + acl_op->new_tensor(acl_op->sbiases(i),acl_op->biases(), biases_shape, biases_coord); + } + + if (conv_method == ConvolutionMethodHint::GEMM) { + func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo,ACLSubTensor>(acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), acl_op->soutput(i), info, hint)); + }else{ + func.push_back(instantiate_op_func<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo,ACLSubTensor>(acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), acl_op->soutput(i), info, hint)); + } + } + return true; +} +template <typename OperatorInfo> +bool instantiate_op_lc(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){ + std::unique_ptr<ACLTensor> & weights=acl_op->weights(); + std::unique_ptr<ACLTensor> & biases=acl_op->biases(); + func.push_back(instantiate_op_func<arm_compute::CLLocallyConnectedLayer,arm_compute::ICLTensor,arm_compute::NELocallyConnectedLayer,arm_compute::ITensor, arm_compute::PadStrideInfo>(input, weights,biases,output,info, hint)); + return true; +} +template <typename OperatorInfo> +bool instantiate_op_fc(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){ + std::unique_ptr<ACLTensor> & weights=acl_op->weights(); + 
std::unique_ptr<ACLTensor> & biases=acl_op->biases(); + func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,arm_compute::ICLTensor,arm_compute::NEFullyConnectedLayer,arm_compute::ITensor, bool>(input, weights,biases,output,info, hint)); + return true; +} +template <typename Dtype> +bool instantiate_op_bn(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, Dtype eps){ + std::unique_ptr<ACLTensor> & mean=acl_op->mean(); + std::unique_ptr<ACLTensor> & var=acl_op->var(); + std::unique_ptr<ACLTensor> & beta=acl_op->beta(); + std::unique_ptr<ACLTensor> & gamma=acl_op->gamma(); + func.push_back(instantiate_op_func<Dtype,arm_compute::CLBatchNormalizationLayer,arm_compute::ICLTensor,arm_compute::NEBatchNormalizationLayer,arm_compute::ITensor>(input, output, mean,var,beta,gamma,eps, hint)); + return true; +} +inline bool instantiate_op_softmax(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,void *data){ + func.push_back(instantiate_op_func<arm_compute::CLSoftmaxLayer,arm_compute::ICLTensor,arm_compute::NESoftmaxLayer,arm_compute::ITensor>(input, output, hint)); + return true; +} +inline bool instantiate_op_concat(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,int num){ + func.push_back(instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,arm_compute::ICLTensor,arm_compute::NEDepthConcatenateLayer,arm_compute::ITensor>(acl_op, output, num,hint)); + return true; +} +template <typename Dtype> +Dtype* GetDataPtr(ACLOperator* op,Blob<Dtype>* const &blob,bool isconst=false){ + if (!isconst) { + if (op->getTargetHint() == TargetHint::NEON) { + return blob->mutable_cpu_data(); + } + return blob->mutable_gpu_data(); + } + if (op->getTargetHint()==TargetHint::NEON) { + return (Dtype*)blob->cpu_data(); + } + return (Dtype*)blob->gpu_data(); +} + +template <typename Dtype> +Dtype* InputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,int index=-1){ + if (index==-1) index=0; + return GetDataPtr(op, bottom[index], true); +} +template <typename Dtype> +Dtype* OutputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& top){ + return GetDataPtr(op,top[0]); +} + +template <typename Dtype> +void acl_run(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top,bool multi_input_run=true){ + if (multi_input_run) { + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* top_data = top[i]->mutable_cpu_data(); + op->acl_run((void*)bottom_data,(void*)top_data); + } + return ; + } + for (int i = 0; i < bottom.size(); ++i) { + op->tensor_mem(op->cinput(i),InputdataPtr(op,bottom,i)); + } + op->acl_run(NULL,OutputdataPtr(op,top)); +} +} + +#define INIT_GLOBAL_FUNCS_TYPE(Dtype) \ +template <> \ +Dtype* InputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,int index); \ +template <> \ +Dtype* OutputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& top); \ +template <> \ +Dtype* GetDataPtr(ACLOperator* op,Blob<Dtype>* const & blob,bool isconst); \ + +#define INIT_GLOBAL_FUNCS() \ +INIT_GLOBAL_FUNCS_TYPE(double); \ +INIT_GLOBAL_FUNCS_TYPE(float); \ + + +#ifdef USE_PROFILING +#define acl_configure(opname,acl_op,args...)\ +{\ + 
set_operator_property(operate_type_##opname,#opname); \ + logtime_util log_time(ACL_CONFIG_INFO); \ + instantiate_op_##opname(acl_op,acl_op->funcs(),acl_op->input(),acl_op->output(),acl_op->getTargetHint(),args);\ +} +#else +#define acl_configure(opname,acl_op,args...)\ +{\ + set_operator_property(operate_type_##opname,#opname); \ + instantiate_op_##opname(acl_op,acl_op->funcs(),acl_op->input(),acl_op->output(),acl_op->getTargetHint(),args);\ +} +#endif + +#endif + +#endif diff --git a/include/caffe/acl_tensor.hpp b/include/caffe/acl_tensor.hpp new file mode 100644 index 00000000..89466de3 --- /dev/null +++ b/include/caffe/acl_tensor.hpp @@ -0,0 +1,114 @@ +#ifndef __TENSOR_H__ +#define __TENSOR_H__ + +#ifdef USE_ACL +#include "arm_compute/runtime/CL/CLSubTensor.h" +#include "arm_compute/runtime/SubTensor.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace caffe{ +enum class TargetHint{ + DONT_CARE, + OPENCL, + NEON, +}; + +enum class ConvolutionMethodHint{ + GEMM, + DIRECT, +}; +namespace cpp14{ +template <class T> +struct _Unique_if{ + typedef std::unique_ptr<T> _Single_object; +}; + +template <class T> +struct _Unique_if<T[]>{ + typedef std::unique_ptr<T[]> _Unknown_bound; +}; + +template <class T, size_t N> +struct _Unique_if<T[N]>{ + typedef void _Known_bound; +}; + +template <class T, class... Args> +typename _Unique_if<T>::_Single_object +make_unique(Args &&... args){ + return std::unique_ptr<T>(new T(std::forward<Args>(args)...)); +} + +template <class T> +typename _Unique_if<T>::_Unknown_bound +make_unique(size_t n){ + typedef typename std::remove_extent<T>::type U; + return std::unique_ptr<T>(new U[n]()); +} + +template <class T, class... Args> +typename _Unique_if<T>::_Known_bound +make_unique(Args &&...) 
; +} + +class Tensor { +public: + Tensor(arm_compute::TensorInfo &info) noexcept; + ~Tensor(){ + } + Tensor(Tensor &&src) noexcept ; + void set_info(arm_compute::TensorInfo &&info){ + _info = info; + } + arm_compute::ITensor *set_target(TargetHint target); + const arm_compute::TensorInfo &info() const{ + return _info; + } + arm_compute::ITensor * tensor(){ + return _tensor.get(); + } + void allocate(); + void init(){ + + } + TargetHint target() const{ + return _target; + } + virtual void map(bool blocking = true); + virtual void unmap(); + +private: + TargetHint _target; + arm_compute::TensorInfo _info; + std::unique_ptr<arm_compute::ITensor> _tensor; +}; + +class SubTensor +{ +public: + SubTensor(Tensor* parent, arm_compute::TensorShape& tensor_shape, arm_compute::Coordinates& coords)noexcept; + ~SubTensor(){} + arm_compute::ITensor *tensor() ; + const arm_compute::ITensor *tensor() const ; + TargetHint target() const ; + void allocate() ; + arm_compute::ITensor *set_target(TargetHint target); + +private: + /** Instantiates a sub-tensor */ + void instantiate_subtensor(); + +private: + TargetHint _target; /**< Target that this tensor is pinned on */ + arm_compute::TensorShape _tensor_shape; /**< SubTensor shape */ + arm_compute::Coordinates _coords; /**< SubTensor Coordinates */ + arm_compute::ITensor *_parent; /**< Parent tensor */ + std::unique_ptr<arm_compute::ITensor> _subtensor; /**< SubTensor */ +}; + +} +#endif +#endif //__TENSOR_H__ diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 49b1e695..4ffeb68c 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -22,6 +22,7 @@ extern unsigned int acl_log_flags; namespace boost { class mutex; } namespace caffe { +bool AclEnableSchedule(int enable=1); #ifdef USE_PROFILING class logtime_util { diff --git a/include/caffe/layers/acl_absval_layer.hpp b/include/caffe/layers/acl_absval_layer.hpp index c1655404..9eba67de 100644 --- a/include/caffe/layers/acl_absval_layer.hpp +++ b/include/caffe/layers/acl_absval_layer.hpp @@ -11,7 +11,7 @@ #include "caffe/layers/absval_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #include "caffe/layers/acl_base_activation_layer.hpp" #endif @@ -46,9 +46,10 @@ class ACLAbsValLayer : public ACLBaseActivationLayer<Dtype>,public AbsValLayer<D const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_base_activation_layer.hpp b/include/caffe/layers/acl_base_activation_layer.hpp index e2abdafa..39643f33 100644 --- a/include/caffe/layers/acl_base_activation_layer.hpp +++ b/include/caffe/layers/acl_base_activation_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/neuron_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #include "caffe/layers/acl_base_activation_layer.hpp" #endif @@ -23,9 +23,9 @@ namespace caffe { * Fallback to BNLLLayer for some corner cases. 
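
The Tensor and SubTensor wrappers declared in acl_tensor.hpp above defer the choice between an arm_compute::Tensor (NEON) and an arm_compute::CLTensor (OpenCL) to set_target(); map() and unmap() only do work on the OpenCL path. A minimal usage sketch, assuming a build with USE_ACL defined and linked against the Arm Compute Library; the 4x4x3 shape is an arbitrary example, not taken from the commit.

    #include "caffe/acl_tensor.hpp"  // requires -DUSE_ACL
    #include <cstring>

    int main() {
      // Describe a small F32 tensor; TensorInfo(shape, Format::F32) matches the usage in acl_operator.cpp.
      arm_compute::TensorInfo info(arm_compute::TensorShape(4U, 4U, 3U), arm_compute::Format::F32);
      caffe::Tensor t(info);
      t.set_target(caffe::TargetHint::NEON);  // instantiate the NEON-backed arm_compute::Tensor
      t.allocate();                           // back it with memory through the tensor's allocator
      t.map();                                // no-op for NEON; maps the CL buffer when the target is OPENCL
      std::memset(t.tensor()->buffer(), 0, t.info().total_size());
      t.unmap();
      return 0;
    }
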
*/ template <typename Dtype> -class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivationLayer> { +class ACLBaseActivationLayer : public ACLOperator { public: - explicit ACLBaseActivationLayer(const LayerParameter& param) + explicit ACLBaseActivationLayer(const LayerParameter& param):ACLOperator(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); @@ -46,8 +46,8 @@ class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivatio const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type=ActivationLayerInfo::ActivationFunction::RELU); + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type=arm_compute::ActivationLayerInfo::ActivationFunction::RELU); }; #endif diff --git a/include/caffe/layers/acl_base_conv_layer.hpp b/include/caffe/layers/acl_base_conv_layer.hpp deleted file mode 100644 index 6b38eb28..00000000 --- a/include/caffe/layers/acl_base_conv_layer.hpp +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef CAFFE_ACL_BASE_CONV_LAYER_HPP_ -#define CAFFE_ACL_BASE_CONV_LAYER_HPP_ - -#include <vector> - -#include "caffe/blob.hpp" -#include "caffe/layer.hpp" -#include "caffe/proto/caffe.pb.h" - -#include "caffe/layers/conv_layer.hpp" - -#ifdef USE_ACL -#include "caffe/acl_layer.hpp" -#endif - -namespace caffe { - -#ifdef USE_ACL -/* - * @brief ACL implementation of ConvolutionLayer. - * Fallback to ConvolutionLayer for some corner cases. - * -*/ -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -class ACLConvolutionLayer : public ACLBaseLayer<GPUConvLayer,CPUConvLayer>,public ConvolutionLayer<Dtype> { - public: - explicit ACLConvolutionLayer(const LayerParameter& param) - : ConvolutionLayer<Dtype>(param) {} - virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top); - virtual void Reshape(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top); - virtual ~ACLConvolutionLayer(); - - protected: - virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top); - virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top); - virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, - const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ - NOT_IMPLEMENTED; - } - virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, - const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ - NOT_IMPLEMENTED; - } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top); - -}; -#endif - -} // namespace caffe - -// Instantiate a class with float and double specifications. 
-#define INSTANTIATE_CONV_CLASS(classname,GPUConvLayer,CPUConvLayer) \ - template class classname<float,GPUConvLayer,CPUConvLayer>; \ - template class classname<double,GPUConvLayer,CPUConvLayer> - -#endif // CAFFE_ACL_BASE_CONV_LAYER_HPP_ diff --git a/include/caffe/layers/acl_batch_norm_layer.hpp b/include/caffe/layers/acl_batch_norm_layer.hpp index e899804f..97dcab3d 100644 --- a/include/caffe/layers/acl_batch_norm_layer.hpp +++ b/include/caffe/layers/acl_batch_norm_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/batch_norm_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -21,10 +21,10 @@ namespace caffe { * Fallback to BatchNormLayer for some corner cases. */ template <typename Dtype> -class ACLBatchNormLayer : public ACLBaseLayer<CLBatchNormalizationLayer,NEBatchNormalizationLayer>,public BatchNormLayer<Dtype> { +class ACLBatchNormLayer : public ACLOperator,public BatchNormLayer<Dtype> { public: explicit ACLBatchNormLayer(const LayerParameter& param) - : BatchNormLayer<Dtype>(param) {} + : ACLOperator(param),BatchNormLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -44,8 +44,9 @@ class ACLBatchNormLayer : public ACLBaseLayer<CLBatchNormalizationLayer,NEBatchN const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_bnll_layer.hpp b/include/caffe/layers/acl_bnll_layer.hpp index ea2f8a16..9cf607e1 100644 --- a/include/caffe/layers/acl_bnll_layer.hpp +++ b/include/caffe/layers/acl_bnll_layer.hpp @@ -11,7 +11,7 @@ #include "caffe/layers/bnll_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #include "caffe/layers/acl_base_activation_layer.hpp" #endif @@ -47,8 +47,9 @@ class ACLBNLLLayer : public ACLBaseActivationLayer<Dtype>,public BNLLLayer<Dtype const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_concat_layer.hpp b/include/caffe/layers/acl_concat_layer.hpp index 90212192..bc1c9173 100644 --- a/include/caffe/layers/acl_concat_layer.hpp +++ b/include/caffe/layers/acl_concat_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/concat_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -21,10 +21,10 @@ namespace caffe { * Fallback to ConcatLayer for some corner cases. 
*/ template <typename Dtype> -class ACLConcatLayer : public ACLBaseLayer<CLDepthConcatenate,NEDepthConcatenate>,public ConcatLayer<Dtype> { +class ACLConcatLayer : public ACLOperator,public ConcatLayer<Dtype> { public: explicit ACLConcatLayer(const LayerParameter& param) - : ConcatLayer<Dtype>(param) {} + : ACLOperator(param),ConcatLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -44,11 +44,9 @@ class ACLConcatLayer : public ACLBaseLayer<CLDepthConcatenate,NEDepthConcatenate const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); - private: - std::vector<ITensor *> cpu_vectors; - std::vector<ICLTensor *> gpu_vectors; + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_conv_layer.hpp b/include/caffe/layers/acl_conv_layer.hpp index 2fd795c9..21ca6aa6 100644 --- a/include/caffe/layers/acl_conv_layer.hpp +++ b/include/caffe/layers/acl_conv_layer.hpp @@ -1,46 +1,56 @@ #ifndef CAFFE_ACL_CONV_LAYER_HPP_ #define CAFFE_ACL_CONV_LAYER_HPP_ +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/conv_layer.hpp" + #ifdef USE_ACL -#include "caffe/layers/acl_base_conv_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { -extern bool use_direct_conv_; #ifdef USE_ACL +/* + * @brief ACL implementation of ConvolutionLayer. + * Fallback to ConvolutionLayer for some corner cases. + * +*/ template <typename Dtype> -inline shared_ptr<Layer<Dtype> > GetACLConvolutionLayer( - const LayerParameter& param) { - ConvolutionParameter conv_param = param.convolution_param(); - const char* pDirectConv; - pDirectConv = getenv ("DIRECTCONV"); - if (pDirectConv){ - unsigned int bdirectconv; - sscanf(pDirectConv,"%i", &bdirectconv); - if(bdirectconv != use_direct_conv_){ - use_direct_conv_ = bdirectconv; - printf("DIRECTCONV<%s>\n", pDirectConv); - printf("DIRECTCONV: %x\n", use_direct_conv_); +class ACLConvolutionLayer : public ACLOperator,public ConvolutionLayer<Dtype> { + public: + explicit ACLConvolutionLayer(const LayerParameter& param) + : ACLOperator(param),ConvolutionLayer<Dtype>(param) { + } + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLConvolutionLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; } - } - int pad_data[3]; - if (conv_param.has_pad_h() || conv_param.has_pad_w()) { - pad_data[0] = conv_param.pad_h(); - pad_data[1] = conv_param.pad_w(); - } else { - const int kDefaultPad = 0; - const int num_pad_dims = conv_param.pad_size(); - for (int i = 0; i < 2; ++i) { - pad_data[i] = (num_pad_dims == 0) ? kDefaultPad : - conv_param.pad((num_pad_dims == 1) ? 
0 : i); + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; } - } - if (use_direct_conv_ && ( (conv_param.kernel_size(0)==1 &&pad_data[0]==0 && pad_data[1]==0) || (conv_param.kernel_size(0)==3 && pad_data[0]<=1 && pad_data[1] <=1 ) )) { - return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype, CLConvolutionLayer, NEDirectConvolutionLayer>(param)); //NEDirectConvolutionLayer only for 1x1 and 3x3 - } - return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype, CLConvolutionLayer, NEConvolutionLayer>(param)); -} + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); +}; + #endif } // namespace caffe diff --git a/include/caffe/layers/acl_inner_product_layer.hpp b/include/caffe/layers/acl_inner_product_layer.hpp index f42becb0..67a29914 100644 --- a/include/caffe/layers/acl_inner_product_layer.hpp +++ b/include/caffe/layers/acl_inner_product_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/inner_product_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -21,10 +21,11 @@ namespace caffe { * Fallback to InnerProductLayer for some corner cases. */ template <typename Dtype> -class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyConnectedLayer>,public InnerProductLayer<Dtype> { +class ACLInnerProductLayer : public ACLOperator,public InnerProductLayer<Dtype> { public: explicit ACLInnerProductLayer(const LayerParameter& param) - : InnerProductLayer<Dtype>(param) {} + : ACLOperator(param),InnerProductLayer<Dtype>(param) { + } virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -44,8 +45,9 @@ class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyCo const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_local_connect_layer.hpp b/include/caffe/layers/acl_local_connect_layer.hpp index fdb30757..b3b3a9e3 100644 --- a/include/caffe/layers/acl_local_connect_layer.hpp +++ b/include/caffe/layers/acl_local_connect_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/local_connect_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -22,10 +22,10 @@ namespace caffe { * */ template <typename Dtype> -class ACLLocalConnectLayer : public ACLBaseLayer<CLLocallyConnectedLayer,NELocallyConnectedLayer>,public LocalConnectLayer<Dtype> { +class ACLLocalConnectLayer : public ACLOperator,public LocalConnectLayer<Dtype> { public: explicit ACLLocalConnectLayer(const LayerParameter& param) - : LocalConnectLayer<Dtype>(param) {} + : ACLOperator(param),LocalConnectLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -45,9 +45,9 @@ class ACLLocalConnectLayer : public ACLBaseLayer<CLLocallyConnectedLayer,NELocal const vector<bool>& 
propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); - + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_lrn_layer.hpp b/include/caffe/layers/acl_lrn_layer.hpp index 6fd9fbc8..1a47f804 100644 --- a/include/caffe/layers/acl_lrn_layer.hpp +++ b/include/caffe/layers/acl_lrn_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/lrn_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -21,10 +21,10 @@ namespace caffe { * Fallback to LRNLayer for some corner cases. */ template <typename Dtype> -class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLayer>,public LRNLayer<Dtype> { +class ACLLRNLayer : public ACLOperator,public LRNLayer<Dtype> { public: explicit ACLLRNLayer(const LayerParameter& param) - : LRNLayer<Dtype>(param) {} + : ACLOperator(param),LRNLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -44,8 +44,9 @@ class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLaye const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_pooling_layer.hpp b/include/caffe/layers/acl_pooling_layer.hpp index acca35cf..b7f94497 100644 --- a/include/caffe/layers/acl_pooling_layer.hpp +++ b/include/caffe/layers/acl_pooling_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/pooling_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -21,10 +21,10 @@ namespace caffe { * Fallback to PoolingLayer for some corner cases. 
*/ template <typename Dtype> -class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,public PoolingLayer<Dtype> { +class ACLPoolingLayer : public ACLOperator,public PoolingLayer<Dtype> { public: explicit ACLPoolingLayer(const LayerParameter& param) - : PoolingLayer<Dtype>(param) {} + : ACLOperator(param),PoolingLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -44,8 +44,9 @@ class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,publi const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_relu_layer.hpp b/include/caffe/layers/acl_relu_layer.hpp index 041dbecf..8bbb7267 100644 --- a/include/caffe/layers/acl_relu_layer.hpp +++ b/include/caffe/layers/acl_relu_layer.hpp @@ -11,7 +11,7 @@ #include "caffe/layers/relu_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #include "caffe/layers/acl_base_activation_layer.hpp" #endif @@ -46,8 +46,9 @@ class ACLReLULayer : public ACLBaseActivationLayer<Dtype>,public ReLULayer<Dtype const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_sigmoid_layer.hpp b/include/caffe/layers/acl_sigmoid_layer.hpp index 8638f73e..0e4bcbaf 100644 --- a/include/caffe/layers/acl_sigmoid_layer.hpp +++ b/include/caffe/layers/acl_sigmoid_layer.hpp @@ -11,7 +11,7 @@ #include "caffe/layers/sigmoid_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #include "caffe/layers/acl_base_activation_layer.hpp" #endif @@ -45,8 +45,9 @@ class ACLSigmoidLayer : public ACLBaseActivationLayer<Dtype>,public SigmoidLayer const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_softmax_layer.hpp b/include/caffe/layers/acl_softmax_layer.hpp index 9e450f5d..7e69992c 100644 --- a/include/caffe/layers/acl_softmax_layer.hpp +++ b/include/caffe/layers/acl_softmax_layer.hpp @@ -10,7 +10,7 @@ #include "caffe/layers/softmax_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #endif namespace caffe { @@ -21,10 +21,10 @@ namespace caffe { * Fallback to SoftmaxLayer for some corner cases. 
*/ template <typename Dtype> -class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,public SoftmaxLayer<Dtype> { +class ACLSoftmaxLayer : public ACLOperator,public SoftmaxLayer<Dtype> { public: explicit ACLSoftmaxLayer(const LayerParameter& param) - : SoftmaxLayer<Dtype>(param) {} + : ACLOperator(param),SoftmaxLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Reshape(const vector<Blob<Dtype>*>& bottom, @@ -44,8 +44,9 @@ class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,publi const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/include/caffe/layers/acl_tanh_layer.hpp b/include/caffe/layers/acl_tanh_layer.hpp index 5a74ce5a..717ef012 100644 --- a/include/caffe/layers/acl_tanh_layer.hpp +++ b/include/caffe/layers/acl_tanh_layer.hpp @@ -11,7 +11,7 @@ #include "caffe/layers/tanh_layer.hpp" #ifdef USE_ACL -#include "caffe/acl_layer.hpp" +#include "caffe/acl_operator.hpp" #include "caffe/layers/acl_base_activation_layer.hpp" #endif @@ -46,8 +46,9 @@ class ACLTanHLayer : public ACLBaseActivationLayer<Dtype>,public TanHLayer<Dtype const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ NOT_IMPLEMENTED; } - virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); + virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type); + virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top); }; #endif diff --git a/src/caffe/acl_layer.cpp b/src/caffe/acl_layer.cpp deleted file mode 100644 index 879b6701..00000000 --- a/src/caffe/acl_layer.cpp +++ /dev/null @@ -1,289 +0,0 @@ -#ifdef USE_ACL -#include "caffe/acl_layer.hpp" - -unsigned int bypass_acl_class_layer = (0 | \ - /*0xffffffff |*/ \ - /*FLAGS_ENABLE_ACL_FC |*/ \ - /*FLAGS_ENABLE_ACL_LRN |*/ \ - 0 ); - -#ifdef USE_PROFILING - -#include "arm_neon.h" - -unsigned int acl_log_flags = (0 | \ - MASK_LOG_APP_TIME | \ - /*MASK_LOG_ALLOCATE | */\ - /*MASK_LOG_ALLOCATE | */\ - /*MASK_LOG_RUN | */\ - /*MASK_LOG_CONFIG | */\ - /*MASK_LOG_COPY | */\ - MASK_LOG_ABSVAL | \ - MASK_LOG_BNLL | \ - MASK_LOG_CONV | \ - MASK_LOG_FC | \ - MASK_LOG_LRN | \ - MASK_LOG_POOLING | \ - MASK_LOG_RELU | \ - MASK_LOG_SIGMOID | \ - MASK_LOG_SOFTMAX | \ - MASK_LOG_TANH | \ - MASK_LOG_LC | \ - MASK_LOG_BN | \ - MASK_LOG_CONCAT | \ - 0); -#include <stdio.h> /* printf */ -#include <stdlib.h> /* getenv */ -#endif //USE_PROFILING - -namespace caffe { -template <typename GPULayer, typename CPULayer> -ACLBaseLayer<GPULayer,CPULayer>::ACLBaseLayer() - :init_layer_(true),force_bypass_acl_path_(false){ - const char* pBypassACL; - pBypassACL = getenv ("BYPASSACL"); - if (pBypassACL){ - unsigned int bacl; - sscanf(pBypassACL,"%i", &bacl); - if(bacl != bypass_acl_class_layer){ - bypass_acl_class_layer = bacl; - printf("BYPASSACL<%s>\n", pBypassACL); - printf("BYPASSACL: %x\n", bypass_acl_class_layer); - } - } -#ifdef USE_PROFILING - const char* pLogACL; - pLogACL = getenv("LOGACL"); - if (pLogACL){ - unsigned int alf; - 
sscanf(pLogACL,"%i", &alf); - if (alf != acl_log_flags){ - acl_log_flags = alf; - printf("LOGACL<%s>\n", pLogACL); - printf("LOGACL: %x\n", acl_log_flags); - } - } -#endif //USE_PROFILING -} -template <typename GPULayer, typename CPULayer> -void ACLBaseLayer<GPULayer,CPULayer>::gpu_run() { - gpu_.run(true); -} -template <typename GPULayer, typename CPULayer> -void ACLBaseLayer<GPULayer,CPULayer>::cpu_run() { - cpu_.run(false); -} - -template <typename GPULayer, typename CPULayer> -ACLBaseLayer<GPULayer,CPULayer>::~ACLBaseLayer(){ -} -template <typename GPULayer, typename CPULayer> -template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem,bool share) -{ - tensor=new ACLTensor(share); -#if 1 //F32 - tensor->allocator()->init(TensorInfo(shape, Format::F32)); -#else //F16 - tensor->allocator()->init(TensorInfo(shape, Format::F16)); -#endif - tensor->bindmem(mem,share); - return true; -} - -template <typename ACLTensor> -void BaseTensor<ACLTensor>::commit(TensorType type){ - settensortype(type); - if (!share_&&mem_) { - if (!allocate_){ -#ifdef USE_PROFILING - logtime_util log_time(ACL_ALLOCATE_INFO); -#endif //USE_PROFILING - ACLTensor::allocator()->allocate(); - allocate_=true; - } - if (type_!= tensor_output) { - tensor_copy(mem_); - } - mem_=nullptr; - } -} - -template <typename ACLTensor> -int BaseTensor<ACLTensor>::tensor_copy(void * mem,bool toTensor) -{ -#ifdef USE_PROFILING - logtime_util log_time(ACL_COPY_INFO); -#endif //USE_PROFILING - arm_compute::Window window; - ACLTensor* tensor=this; - window.use_tensor_dimensions(tensor->info()->tensor_shape(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element) - int width = tensor->info()->tensor_shape()[0]; //->dimension(0); //window.x().end() - window.x().start(); // + 1; - int height = tensor->info()->tensor_shape()[1]; //->dimension(1); //window.y().end() - window.y().start(); // + 1; - int deepth = tensor->info()->tensor_shape()[2]; - map(); - // Create an iterator: - arm_compute::Iterator it(tensor, window); - // Except it works for an arbitrary number of dimensions - if (toTensor) { //mem->tensor - arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) - { -#if 0 //F16 - if (tensor->info()->element_size() ==2) - { - for(int i = 0; i < width; i+= 4){ - auto pa = (float32x4_t*)((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4); - *(float16x4_t*)(((char*)it.ptr()) + i*2) = vcvt_f16_f32(*pa); - } - } - else{ -#endif - memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size()); -#if 0 //F16 - } -#endif - }, - it); - }else{ //tensor-->mem - arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) - { -#if 0 //F16 - if (tensor->info()->element_size() ==2) - { - for(int i = 0; i < width; i+= 4){ - auto pa = (float32x4_t*)(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4)); - *pa = vcvt_f32_f16(*(float16x4_t*)(((char*)it.ptr()) + i*2)); - } - } - else{ -#endif - memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size()); -#if 0 //F16 - } -#endif - }, - it); - } - unmap(); - - return 0; -} - -template 
<typename GPULayer, typename CPULayer> -template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share) -{ - tensor->bindmem(mem,share); - return true; -} - -template <typename GPULayer, typename CPULayer> -template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share) -{ - if (mem==tensor->buffer()) return true; - if (!share) { - tensor->tensor_copy(mem,false); - } - return true; -} - - -template <typename GPULayer, typename CPULayer> -bool ACLBaseLayer<GPULayer,CPULayer>::checkreshape(TensorShape shape,bool gpu, TensorType type) -{ - if (gpu) { - init_layer_ = gpu_.reshape(shape,type); - }else{ - init_layer_ = cpu_.reshape(shape,type); - } - return init_layer_; -} - -template <typename GPULayer, typename CPULayer> -GPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_gpulayer(){ - gpu_.layer= new GPULayer; - return gpu_.layer; -} -template <typename GPULayer, typename CPULayer> -CPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_cpulayer(){ - cpu_.layer= new CPULayer; - return cpu_.layer; -} -template <typename ACLLayer,typename ACLTensor> -bool ACLXPUBaseLayer<ACLLayer,ACLTensor>::reshape(TensorShape &shape,TensorType type) -{ - TensorShape _shape; - if (!layer) return true; -#ifdef USE_CONV_CACHE - if (tensor_input == type){ - _shape = input->info()->tensor_shape(); - if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { - return false; - } - for(int i = 0; i < 16; ++i){ - if(cache.input[i] == nullptr) break; - _shape = cache.input[i]->info()->tensor_shape(); - if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { - this->layer = cache.layer[i]; - this->input = cache.input[i]; - this->output = cache.output[i]; - this->weights = cache.weights[i]; - this->biases = cache.biases[i]; - return false; - } - } - } -#endif //USE_CONV_CACHE - switch (type) { - case tensor_biases: - _shape = biases->info()->tensor_shape(); - break; - case tensor_weights: - _shape = weights->info()->tensor_shape(); - break; - case tensor_output: - _shape = output->info()->tensor_shape(); - break; - case tensor_input: - default: - _shape = input->info()->tensor_shape(); - break; - } - if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { - return false; - } - freelayer(); - return true; -} - -INSTANTIATE_ACLBASECLASS(CLNormalizationLayer,NENormalizationLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLActivationLayer,NEActivationLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLPoolingLayer,NEPoolingLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLSoftmaxLayer,NESoftmaxLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLFullyConnectedLayer,NEFullyConnectedLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,CPUTensor); 
-INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEConvolutionLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEDirectConvolutionLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEDirectConvolutionLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEDirectConvolutionLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLBatchNormalizationLayer,NEBatchNormalizationLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLBatchNormalizationLayer,NEBatchNormalizationLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLBatchNormalizationLayer,NEBatchNormalizationLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLLocallyConnectedLayer,NELocallyConnectedLayer); - INSTANTIATE_ACLBASE_FUNCTION(CLLocallyConnectedLayer,NELocallyConnectedLayer,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLLocallyConnectedLayer,NELocallyConnectedLayer,CPUTensor); -INSTANTIATE_ACLBASECLASS(CLDepthConcatenate,NEDepthConcatenate); - INSTANTIATE_ACLBASE_FUNCTION(CLDepthConcatenate,NEDepthConcatenate,GPUTensor); - INSTANTIATE_ACLBASE_FUNCTION(CLDepthConcatenate,NEDepthConcatenate,CPUTensor); -} - -#endif diff --git a/src/caffe/acl_operator.cpp b/src/caffe/acl_operator.cpp new file mode 100644 index 00000000..b35265a8 --- /dev/null +++ b/src/caffe/acl_operator.cpp @@ -0,0 +1,227 @@ +#ifdef USE_ACL +#include "caffe/acl_operator.hpp" +#include "caffe/common.hpp" + +unsigned int bypass_acl_class_layer = (0 | \ + FLAGS_ENABLE_ACL_CONCAT | \ + /*0xffffffff |*/ \ + /*FLAGS_ENABLE_ACL_FC |*/ \ + /*FLAGS_ENABLE_ACL_LRN |*/ \ + 0 ); + +unsigned int openailab_intfp = 0; +int enable_schedule=0; + +#ifdef USE_PROFILING + +#include "arm_neon.h" + +unsigned int acl_log_flags = (0 | \ + MASK_LOG_APP_TIME | \ + /*MASK_LOG_ALLOCATE | */\ + /*MASK_LOG_ALLOCATE | */\ + /*MASK_LOG_RUN | */\ + /*MASK_LOG_CONFIG | */\ + /*MASK_LOG_COPY | */\ + MASK_LOG_ABSVAL | \ + MASK_LOG_BNLL | \ + MASK_LOG_CONV | \ + MASK_LOG_FC | \ + MASK_LOG_LRN | \ + MASK_LOG_POOLING | \ + MASK_LOG_RELU | \ + MASK_LOG_SIGMOID | \ + MASK_LOG_SOFTMAX | \ + MASK_LOG_TANH | \ + MASK_LOG_LC | \ + MASK_LOG_BN | \ + MASK_LOG_CONCAT | \ + 0); +#include <stdio.h> /* printf */ +#include <stdlib.h> /* getenv */ +#endif //USE_PROFILING + +namespace caffe { +bool AclEnableSchedule(int enable){ + enable_schedule=enable; + if (enable) { + Caffe::set_mode(Caffe::GPU); + } + return true; +} +int isScheduleEnable() +{ + return enable_schedule; +} +bool ACLOperator::init_cl_env=true; +bool ACLOperator::support_opencl_=false; +bool opencl_is_available() +{ + return arm_compute::opencl_is_available(); +} +ACLOperator::ACLOperator(const LayerParameter& param) + :operator_state_(operator_not_init),force_bypass_acl_path_(false), + target_hint_(TargetHint::DONT_CARE), + convolution_method_hint_(ConvolutionMethodHint::GEMM), + _group(1),name_(""){ + const char* pBypassACL; + if(init_cl_env){ +#ifdef USE_OPENCL + try { + if (opencl_is_available()) { + arm_compute::CLScheduler::get().default_init(); + support_opencl_=true; + } + }catch(std::exception& e){ + support_opencl_=false; + } +#endif + init_cl_env=false; + } + pBypassACL = getenv ("BYPASSACL"); + if (pBypassACL){ + unsigned int bacl; + sscanf(pBypassACL,"%i", &bacl); + if(bacl != bypass_acl_class_layer){ + bypass_acl_class_layer = bacl; + printf("BYPASSACL<%s>\n", pBypassACL); + printf("BYPASSACL: %x\n", bypass_acl_class_layer); + } + } + + const string& layer_type = param.type(); + if 
(layer_type=="Convolution") { + ConvolutionParameter conv_param = param.convolution_param(); + const char* pDirectConv; + unsigned int use_direct_conv=0; + pDirectConv = getenv ("DIRECTCONV"); + if (pDirectConv){ + unsigned int bdirectconv; + sscanf(pDirectConv,"%i", &bdirectconv); + if(bdirectconv != use_direct_conv){ + use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[3]; + if (conv_param.has_pad_h() || conv_param.has_pad_w()) { + pad_data[0] = conv_param.pad_h(); + pad_data[1] = conv_param.pad_w(); + } else { + const int kDefaultPad = 0; + const int num_pad_dims = conv_param.pad_size(); + for (int i = 0; i < 2; ++i) { + pad_data[i] = (num_pad_dims == 0) ? kDefaultPad : + conv_param.pad((num_pad_dims == 1) ? 0 : i); + } + } + if (use_direct_conv && ( (conv_param.kernel_size(0)==1 &&pad_data[0]==0 && pad_data[1]==0) || (conv_param.kernel_size(0)==3 && pad_data[0]<=1 && pad_data[1] <=1 ) )) { + convolution_method_hint_=ConvolutionMethodHint::DIRECT; //NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + +#ifdef USE_PROFILING + const char* pLogACL; + pLogACL = getenv("LOGACL"); + if (pLogACL){ + unsigned int alf; + sscanf(pLogACL,"%i", &alf); + if (alf != acl_log_flags){ + acl_log_flags = alf; + printf("LOGACL<%s>\n", pLogACL); + printf("LOGACL: %x\n", acl_log_flags); + } + } +#endif //USE_PROFILING + const char* pEnableSchedule; + pEnableSchedule = getenv ("ENABLESCHEDULE"); + if (pEnableSchedule){ + unsigned int bshedule; + sscanf(pEnableSchedule,"%i", &bshedule); + if(bshedule != enable_schedule){ + enable_schedule = bshedule; + printf("ENABLESCHEDULE<%s>\n", pEnableSchedule); + printf("ENABLESCHEDULE: %x\n", enable_schedule); + } + if (enable_schedule) { + AclEnableSchedule(1); + } + } +} +ACLOperator::~ACLOperator() { +} + +bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor> &tensor,arm_compute::TensorShape &shape,void *mem,bool commit) +{ + auto acl_tensor=new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32)); + acl_tensor->set_target(getTargetHint()); + acl_tensor->bindmem(mem); + if (commit) acl_tensor->commit(); + tensor=(std::unique_ptr<ACLTensor>) std::move(acl_tensor); + return true; +} +bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor> &tensor,std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord) +{ + auto acl_tensor=new ACLSubTensor(parent,shape, coord); + acl_tensor->set_target(getTargetHint()); + tensor=(std::unique_ptr<ACLSubTensor>) std::move(acl_tensor); + return true; +} + +void ACLTensor::commit(TensorType type) +{ + settensortype(type); + if (mem_) { + if (!allocate_){ +#ifdef USE_PROFILING + logtime_util log_time(ACL_ALLOCATE_INFO); +#endif //USE_PROFILING + allocate(); + allocate_=true; + } + if (type_!= tensor_output) { + tensor_copy(mem_); + } + mem_=nullptr; + } +} + +int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor,void * mem,bool toTensor) +{ +#ifdef USE_PROFILING + logtime_util log_time(ACL_COPY_INFO); +#endif //USE_PROFILING + arm_compute::Window window; + window.use_tensor_dimensions(tensor->info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY); // Iterate through the rows (not each element) + int width = tensor->info()->tensor_shape()[0]; + int height = tensor->info()->tensor_shape()[1]; + int deepth = tensor->info()->tensor_shape()[2]; + map(); + // Create an iterator: + arm_compute::Iterator it(tensor, window); + // Except it works for an arbitrary number 
of dimensions + if (toTensor) { //mem->tensor + arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) + { + memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size()); + }, + it); + }else{ //tensor-->mem + arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) + { + memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size()); + }, + it); + } + unmap(); + + return 0; +} + +INIT_GLOBAL_FUNCS(); + +} + + +#endif diff --git a/src/caffe/acl_tensor.cpp b/src/caffe/acl_tensor.cpp new file mode 100644 index 00000000..1ab82330 --- /dev/null +++ b/src/caffe/acl_tensor.cpp @@ -0,0 +1,138 @@ +#include "caffe/acl_tensor.hpp" + +namespace caffe { + +#ifdef USE_ACL +template <typename TensorType> +std::unique_ptr<arm_compute::ITensor> initialise_tensor(arm_compute::TensorInfo &info) +{ + auto tensor = cpp14::make_unique<TensorType>(); + tensor->allocator()->init(info); + return std::move(tensor); +} + +template <typename TensorType> +void tensor_allocate(arm_compute::ITensor &tensor) +{ + auto itensor = dynamic_cast<TensorType *>(&tensor); + itensor->allocator()->allocate(); +} + +Tensor::Tensor(arm_compute::TensorInfo &info) noexcept + : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) +{ +} + +Tensor::Tensor(Tensor &&src) noexcept + : _target(src._target), + _info(std::move(src._info)), + _tensor(std::move(src._tensor)) +{ +} + +arm_compute::ITensor *Tensor::set_target(TargetHint target) +{ + switch(target) + { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _tensor = initialise_tensor<arm_compute::CLTensor>(_info); + break; +#endif + case TargetHint::NEON: + _tensor = initialise_tensor<arm_compute::Tensor>(_info); + break; + default: + break; + } + _target = target; + return _tensor.get(); +} + +void Tensor::allocate() +{ + switch(_target) + { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + tensor_allocate<arm_compute::CLTensor>(*_tensor); + break; +#endif + case TargetHint::NEON: + tensor_allocate<arm_compute::Tensor>(*_tensor); + break; + default: + break; + } +} +void Tensor::map(bool blocking){ +#ifdef USE_OPENCL + if (_target==TargetHint::OPENCL) + dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking); +#endif +} +void Tensor::unmap(){ +#ifdef USE_OPENCL + if (_target==TargetHint::OPENCL) + dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap(); +#endif +} + +template <typename SubTensorType, typename ParentTensorType> +std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, arm_compute::TensorShape shape, arm_compute::Coordinates coords) +{ + auto ptensor = dynamic_cast<ParentTensorType *>(parent); + auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords); + return std::move(subtensor); +} +SubTensor::SubTensor(Tensor* parent, arm_compute::TensorShape& tensor_shape, arm_compute::Coordinates& coords) noexcept + : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr) +{ + _parent = parent->tensor(); + _target = parent->target(); + + instantiate_subtensor(); +} +arm_compute::ITensor *SubTensor::set_target(TargetHint target) +{ + return (target == _target) ? 
_subtensor.get() : nullptr; +} + +arm_compute::ITensor *SubTensor::tensor() +{ + return _subtensor.get(); +} + +const arm_compute::ITensor *SubTensor::tensor() const +{ + return _subtensor.get(); +} + +TargetHint SubTensor::target() const +{ + return _target; +} + +void SubTensor::allocate() +{ + // NOP for sub-tensors +} + +void SubTensor::instantiate_subtensor() +{ + switch(_target) + { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords); + break; +#endif + default: + case TargetHint::NEON: + _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords); + break; + } +} + +#endif +} diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index f1db7dd4..dd800b04 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -7,10 +7,6 @@ #include "caffe/common.hpp" #include "caffe/util/rng.hpp" -#ifdef USE_ACL -#include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; -#endif namespace caffe { @@ -58,18 +54,8 @@ void GlobalInit(int* pargc, char*** pargv) { Caffe::Caffe() : random_generator_(), mode_(Caffe::CPU),use_mali_gpu_(false), - solver_count_(1), solver_rank_(0), multiprocess_(false) { -#ifdef USE_ACL - - try { - CLScheduler::get().default_init(); - } - catch(std::exception& e) + solver_count_(1), solver_rank_(0), multiprocess_(false) { - std::cout << "OPENCL initialization failed"<< std::endl; - } - -#endif } Caffe::~Caffe() { } diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index f9e2908c..5e1011e6 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -56,7 +56,7 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer( ConvolutionParameter conv_param = param.convolution_param(); ConvolutionParameter_Engine engine = conv_param.engine(); #ifdef USE_ACL - return GetACLConvolutionLayer<Dtype>(param); + return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype>(param)); #endif #ifdef USE_CUDNN bool use_dilation = false; diff --git a/src/caffe/layers/acl_absval_layer.cpp b/src/caffe/layers/acl_absval_layer.cpp index b0b0304d..deea89c7 100644 --- a/src/caffe/layers/acl_absval_layer.cpp +++ b/src/caffe/layers/acl_absval_layer.cpp @@ -15,9 +15,9 @@ void ACLAbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> -void ACLAbsValLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){ - ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::ABS); +void ACLAbsValLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::ABS); } template <typename Dtype> @@ -28,12 +28,21 @@ void ACLAbsValLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLAbsValLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLAbsValLayer<Dtype>::Forward_cpu( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #ifdef USE_PROFILING logtime_util log_time(ACL_ABSVAL_INFO); 
#endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { AbsValLayer<Dtype>::Forward_cpu(bottom,top); return; } @@ -46,7 +55,7 @@ void ACLAbsValLayer<Dtype>::Forward_gpu( #ifdef USE_PROFILING logtime_util log_time(ACL_ABSVAL_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { AbsValLayer<Dtype>::Forward_cpu(bottom,top); return; } diff --git a/src/caffe/layers/acl_base_activation_layer.cpp b/src/caffe/layers/acl_base_activation_layer.cpp index 1fcd682c..cac524fb 100644 --- a/src/caffe/layers/acl_base_activation_layer.cpp +++ b/src/caffe/layers/acl_base_activation_layer.cpp @@ -11,50 +11,25 @@ void ACLBaseActivationLayer<Dtype>::LayerSetUp( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { } template <typename Dtype> -void ACLBaseActivationLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){ +void ACLBaseActivationLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){ const unsigned int count = bottom[0]->count(); const unsigned int count_ = top[0]->count(); - TensorShape input_shape(count); - TensorShape output_shape(count_); - checkreshape(input_shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - this->init_layer_=false; - // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } + arm_compute::TensorShape input_shape(count); + arm_compute::TensorShape output_shape(count_); + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); - this->force_bypass_acl_path_=false; - ActivationLayerInfo act_info(type); + // Initialize ACL. 
+ arm_compute::ActivationLayerInfo act_info(type); - if(type== ActivationLayerInfo::ActivationFunction::TANH) - act_info=ActivationLayerInfo(type,1.0,1.0); - - + if(type== arm_compute::ActivationLayerInfo::ActivationFunction::TANH) + act_info=arm_compute::ActivationLayerInfo(type,1.0,1.0); - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - new_tensor(this->gpu().input,input_shape,(void*)bottom_data); - new_tensor(this->gpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().output,act_info); - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - new_tensor(this->cpu().input,input_shape,(void*)bottom_data); - new_tensor(this->cpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().output,act_info); - } + new_tensor(input(),input_shape,(void*)InputdataPtr(this,bottom)); + new_tensor(output(),output_shape,(void*)OutputdataPtr(this,top)); + acl_configure(activation,this,act_info); } template <typename Dtype> void ACLBaseActivationLayer<Dtype>::Reshape( @@ -64,27 +39,19 @@ void ACLBaseActivationLayer<Dtype>::Reshape( template <typename Dtype> void ACLBaseActivationLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ - Forward_gpu(bottom, top); + if(isGPUMode()){ + ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom, top); return; } - Dtype* top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - SetupACLLayer(bottom,top); - tensor_mem(this->cpu().input,(void*)(bottom_data)); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> void ACLBaseActivationLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - SetupACLLayer(bottom,top); - tensor_mem(this->gpu().input,(void*)(bottom_data)); - gpu_run(); - tensor_mem((void*)(top_data),this->gpu().output); + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> diff --git a/src/caffe/layers/acl_base_conv_layer.cpp b/src/caffe/layers/acl_base_conv_layer.cpp deleted file mode 100644 index e3c58996..00000000 --- a/src/caffe/layers/acl_base_conv_layer.cpp +++ /dev/null @@ -1,222 +0,0 @@ -#ifdef USE_ACL -#include <algorithm> -#include <vector> - -#include "caffe/filler.hpp" -#include "caffe/layers/acl_conv_layer.hpp" - -namespace caffe { - -bool use_direct_conv_=false; -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::LayerSetUp( - const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - ConvolutionLayer<Dtype>::LayerSetUp(bottom, top); - this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; -} - -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top){ - - TensorShape input_shape((unsigned 
int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::checkreshape(input_shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - this->init_layer_=false; - // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_gpulayer(); - }else{ - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_cpulayer(); - } - this->force_bypass_acl_path_=false; - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - int stride_x =this->stride_.mutable_cpu_data()[1]; - int stride_y =this->stride_.mutable_cpu_data()[0]; - int pad_x=this->pad_.mutable_cpu_data()[1]; - int pad_y=this->pad_.mutable_cpu_data()[0]; - unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1]; - unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0]; - PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y); - TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_); - TensorShape biases_shape ((unsigned int)this->num_output_); - TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); - - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - //[kernel_x, kernel_y, IFM, OFM] - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - //[OFM] - if (this->bias_term_) { - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data())); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data())); - } - - //[width, height, IFM] - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().input,input_shape,(void*)bottom_data); - //[width, height, OFM] - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - { - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info); -#ifdef USE_PROFILING - } -#endif //USE_PROFILING -#ifdef USE_CONV_CACHE - for(int i = 0; i < 16; ++i){ - fprintf(stderr, "<GPU>check cache[%d]\n", i); - if(this->gpu().cache.layer[i] == nullptr){ - this->gpu().cache.layer[i] = this->gpu().layer; - this->gpu().cache.input[i] = this->gpu().input; - this->gpu().cache.output[i] = this->gpu().output; - this->gpu().cache.weights[i] = this->gpu().weights; - this->gpu().cache.biases[i] = this->gpu().biases; - break; - } - } -#endif //USE_CONV_CACHE - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - //[kernel_x, kernel_y, IFM, OFM] - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - //[OFM] - if (this->bias_term_) { - 
ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data())); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data())); - } - - //[width, height, IFM] - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().input,input_shape,(void*)bottom_data); - //[width, height, OFM] - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - { - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info); -#ifdef USE_PROFILING - } -#endif //USE_PROFILING -#ifdef USE_CONV_CACHE - for(int i = 0; i < 16; ++i){ - fprintf(stderr, "<CPU>check cache[%d]\n", i); - if(this->cpu().cache.layer[i] == nullptr){ - this->cpu().cache.layer[i] = this->cpu().layer; - this->cpu().cache.input[i] = this->cpu().input; - this->cpu().cache.output[i] = this->cpu().output; - this->cpu().cache.weights[i] = this->cpu().weights; - this->cpu().cache.biases[i] = this->cpu().biases; - break; - } - } -#endif //USE_CONV_CACHE - } -} -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Reshape( - const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - ConvolutionLayer<Dtype>::Reshape(bottom, top); -} - -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Forward_cpu( - const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ - Forward_gpu(bottom, top); - return; - } -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONV_INFO); -#endif //USE_PROFILING - if (this->force_bypass_acl_path_|| this->group_!=1) { - ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) { - ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - /* check dilation */ - int dilated=0; - - for(int i=0;i<this->num_spatial_axes_;i++) - { - const int *p=this->dilation_.cpu_data(); - - if(p[i]!=1) - dilated=1; - } - if(dilated) { - ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - - SetupACLLayer(bottom,top); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* top_data = top[i]->mutable_cpu_data(); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().input,(void*)bottom_data); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::cpu_run(); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem((void*)top_data,this->cpu().output); - } -} - -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Forward_gpu( - const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONV_INFO); -#endif //USE_PROFILING - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - if (this->force_bypass_acl_path_|| this->group_!=1) { - ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 ) { - ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); - return; 
- } - /* check dilation */ - int dilated=0; - - for(int i=0;i<this->num_spatial_axes_;i++) - { - const int *p=this->dilation_.gpu_data(); - - if(p[i]!=1) - dilated=1; - } - - if(dilated) { - ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - SetupACLLayer(bottom,top); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().input,(void*)bottom_data); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::gpu_run(); - ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem((void*)top_data,this->gpu().output); - } -} - -template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer> -ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::~ACLConvolutionLayer() { -} - -#ifdef USE_ACL -INSTANTIATE_CONV_CLASS(ACLConvolutionLayer,CLConvolutionLayer,NEDirectConvolutionLayer); -INSTANTIATE_CONV_CLASS(ACLConvolutionLayer,CLConvolutionLayer,NEConvolutionLayer); -#endif - -} // namespace caffe -#endif // USE_ACL diff --git a/src/caffe/layers/acl_batch_norm_layer.cpp b/src/caffe/layers/acl_batch_norm_layer.cpp index a6bc16d1..15df15cb 100644 --- a/src/caffe/layers/acl_batch_norm_layer.cpp +++ b/src/caffe/layers/acl_batch_norm_layer.cpp @@ -12,94 +12,43 @@ void ACLBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN; } template <typename Dtype> -void ACLBatchNormLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLBatchNormLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ - if (!this->init_layer_) return; - this->init_layer_=false; - // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } - - this->force_bypass_acl_path_=false; + arm_compute::TensorShape in_shape ((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); + if (is_operator_init_done(in_shape)) return; + set_operator_init_done(); - TensorShape in_shape ((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); - TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); - TensorShape mean_shape((unsigned int)this->channels_); - TensorShape var_shape=mean_shape; - TensorShape beta_shape=mean_shape; - TensorShape gamma_shape=mean_shape; + // Initialize ACL. + arm_compute::TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); + arm_compute::TensorShape mean_shape((unsigned int)this->channels_); + arm_compute::TensorShape var_shape=mean_shape; + arm_compute::TensorShape beta_shape=mean_shape; + arm_compute::TensorShape gamma_shape=mean_shape; Dtype beta_val[beta_shape.total_size()]; Dtype gamma_val[gamma_shape.total_size()]; - for (int i=0;i<beta_shape.total_size();++i) { beta_val[i]=0.0; } for (int i=0;i<gamma_shape.total_size();++i) { gamma_val[i]=1.0; } - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // use the stored mean/variance estimates. - const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
- 0 : 1 / this->blobs_[2]->cpu_data()[0]; - caffe_cpu_scale(this->variance_.count(), scale_factor, - this->blobs_[0]->gpu_data(), this->mean_.mutable_gpu_data()); - caffe_cpu_scale(this->variance_.count(), scale_factor, - this->blobs_[1]->gpu_data(), this->variance_.mutable_gpu_data()); - new_tensor(this->gpu().input,in_shape,(void*)bottom_data); - new_tensor(this->gpu().output,out_shape,(void*)top_data); - new_tensor(this->gpu().mean,mean_shape); - new_tensor(this->gpu().var,var_shape); - new_tensor(this->gpu().beta,beta_shape); - new_tensor(this->gpu().gamma,gamma_shape); - tensor_mem(this->gpu().mean,(void*)this->mean_.mutable_gpu_data()); - tensor_mem(this->gpu().var,(void*)this->variance_.mutable_gpu_data()); - tensor_mem(this->gpu().beta,(void*)beta_val); - tensor_mem(this->gpu().gamma,(void*)gamma_val); - this->gpu().mean->commit(); - this->gpu().var->commit(); - this->gpu().beta->commit(); - this->gpu().gamma->commit(); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().output,this->gpu().mean,this->gpu().var,this->gpu().beta,this->gpu().gamma,this->eps_); - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - // use the stored mean/variance estimates. - const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? - 0 : 1 / this->blobs_[2]->cpu_data()[0]; - caffe_cpu_scale(this->variance_.count(), scale_factor, - this->blobs_[0]->cpu_data(), this->mean_.mutable_cpu_data()); - caffe_cpu_scale(this->variance_.count(), scale_factor, - this->blobs_[1]->cpu_data(), this->variance_.mutable_cpu_data()); - new_tensor(this->cpu().input,in_shape,(void*)bottom_data); - new_tensor(this->cpu().output,out_shape,(void*)top_data); - new_tensor(this->cpu().mean,mean_shape); - new_tensor(this->cpu().var,var_shape); - new_tensor(this->cpu().beta,beta_shape); - new_tensor(this->cpu().gamma,gamma_shape); - tensor_mem(this->cpu().mean,(void*)this->mean_.mutable_cpu_data()); - tensor_mem(this->cpu().var,(void*)this->variance_.mutable_cpu_data()); - tensor_mem(this->cpu().beta,(void*)beta_val); - tensor_mem(this->cpu().gamma,(void*)gamma_val); - this->cpu().mean->commit(); - this->cpu().var->commit(); - this->cpu().beta->commit(); - this->cpu().gamma->commit(); + new_tensor(input(),in_shape,InputdataPtr(this,bottom)); + new_tensor(output(),out_shape,OutputdataPtr(this,top)); + // use the stored mean/variance estimates. + const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
+ 0 : 1 / this->blobs_[2]->cpu_data()[0]; + caffe_cpu_scale(this->variance_.count(), scale_factor, + this->blobs_[0]->cpu_data(), GetDataPtr(this,&this->mean_)); + caffe_cpu_scale(this->variance_.count(), scale_factor, + this->blobs_[1]->cpu_data(), GetDataPtr(this,&this->variance_)); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().output,this->cpu().mean,this->cpu().var,this->cpu().beta,this->cpu().gamma,this->eps_); - } + new_tensor(mean(),mean_shape,GetDataPtr(this,&this->mean_)); + new_tensor(var(),var_shape,GetDataPtr(this,&this->variance_)); + new_tensor(beta(),beta_shape,(void*)beta_val,true); + new_tensor(gamma(),gamma_shape,(void*)gamma_val,true); + acl_configure(bn,this,this->eps_); } template <typename Dtype> void ACLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -109,43 +58,48 @@ void ACLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLBatchNormLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_||!this->use_global_stats_) { + bypass_acl=true; + } + if (isScheduleEnable()) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_BN_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_||!this->use_global_stats_) { + if (Bypass_acl(bottom,top)) { BatchNormLayer<Dtype>::Forward_cpu(bottom,top); return; } - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - SetupACLLayer(bottom,top); - tensor_mem(this->cpu().input,(void*)(bottom_data)); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> void ACLBatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if (this->force_bypass_acl_path_||!this->use_global_stats_) { - BatchNormLayer<Dtype>::Forward_cpu(bottom,top); - return; - } #ifdef USE_PROFILING logtime_util log_time(ACL_BN_INFO); #endif //USE_PROFILING - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - SetupACLLayer(bottom,top); - tensor_mem(this->gpu().input,(void*)(bottom_data)); - gpu_run(); - tensor_mem((void*)(top_data),this->gpu().output); + if (Bypass_acl(bottom,top)) { + BatchNormLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> diff --git a/src/caffe/layers/acl_bnll_layer.cpp b/src/caffe/layers/acl_bnll_layer.cpp index 86f09831..20903cda 100644 --- a/src/caffe/layers/acl_bnll_layer.cpp +++ b/src/caffe/layers/acl_bnll_layer.cpp @@ -14,9 +14,9 @@ void ACLBNLLLayer<Dtype>::LayerSetUp( this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BNLL; } template <typename Dtype> -void ACLBNLLLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){ - ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::SOFT_RELU); +void 
ACLBNLLLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU); } template <typename Dtype> void ACLBNLLLayer<Dtype>::Reshape( @@ -26,12 +26,21 @@ void ACLBNLLLayer<Dtype>::Reshape( } template <typename Dtype> +bool ACLBNLLLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLBNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #ifdef USE_PROFILING logtime_util log_time(ACL_BNLL_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { BNLLLayer<Dtype>::Forward_cpu(bottom,top); return; } @@ -44,7 +53,7 @@ void ACLBNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_BNLL_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { BNLLLayer<Dtype>::Forward_cpu(bottom,top); return; } diff --git a/src/caffe/layers/acl_concat_layer.cpp b/src/caffe/layers/acl_concat_layer.cpp index 57a14126..d849a9e0 100644 --- a/src/caffe/layers/acl_concat_layer.cpp +++ b/src/caffe/layers/acl_concat_layer.cpp @@ -9,90 +9,66 @@ template <typename Dtype> void ACLConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { ConcatLayer<Dtype>::LayerSetUp(bottom, top); - //this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT; - this->force_bypass_acl_path_= true; + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT; } template <typename Dtype> -void ACLConcatLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLConcatLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ unsigned int channels=0; for (int i = 0; i < bottom.size(); ++i) { channels+=bottom[i]->channels(); } - TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),channels); + arm_compute::TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),channels); + + if (is_operator_init_done(out_shape,tensor_output)) return; + set_operator_init_done(); - if (!this->init_layer_) return; - this->init_layer_=false; // Initialize ACL. 
- if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); + std::vector<arm_compute::TensorShape> shapes; + for (int i = 0; i < bottom.size(); ++i) { + arm_compute::TensorShape in_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[i]->channels()); + new_tensor(cinput(i),in_shape,InputdataPtr(this,bottom,i)); } + new_tensor(output(),out_shape,OutputdataPtr(this,top)); + acl_configure(concat,this,bottom.size()); - this->force_bypass_acl_path_=false; - - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - TensorShape vec_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[0]->channels()); - GPUTensor *vector; - new_tensor(vector,vec_shape,(void*)bottom_data); - tensor_mem(vector,(void*)bottom_data); - vector->commit(); - gpu_vectors.push_back(vector); - } - new_tensor(this->gpu().output,out_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(gpu_vectors,this->gpu().output); - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - TensorShape vec_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[0]->channels()); - CPUTensor *vector; - new_tensor(vector,vec_shape,(void*)bottom_data); - tensor_mem(vector,(void*)bottom_data); - vector->commit(); - cpu_vectors.push_back(vector); - } - new_tensor(this->cpu().output,out_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(cpu_vectors,this->cpu().output); - } } template <typename Dtype> void ACLConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - ConcatLayer<Dtype>::Reshape(bottom, top); + ConcatLayer<Dtype>::Reshape(bottom, top); +} +template <typename Dtype> +bool ACLConcatLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_||this->concat_axis_==0) { + bypass_acl=true; + } + if(isScheduleEnable()){ + bypass_acl=true; + } + return bypass_acl; } template <typename Dtype> void ACLConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_CONCAT_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_||this->concat_axis_==0) { + if (Bypass_acl(bottom,top)) { ConcatLayer<Dtype>::Forward_cpu(bottom,top); return; } - Dtype* top_data = top[0]->mutable_cpu_data(); - SetupACLLayer(bottom,top); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top,false); } template <typename Dtype> @@ -101,27 +77,16 @@ void ACLConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_CONCAT_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_||this->concat_axis_==0) { + if (Bypass_acl(bottom,top)) { ConcatLayer<Dtype>::Forward_cpu(bottom,top); return; } - Dtype* top_data = top[0]->mutable_gpu_data(); - SetupACLLayer(bottom,top); - gpu_run(); - 
tensor_mem((void*)(top_data),this->gpu().output); + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top,false); } template <typename Dtype> ACLConcatLayer<Dtype>::~ACLConcatLayer() { - if(this->force_bypass_acl_path_)return; - for (int i =0; i < cpu_vectors.size(); i ++) { - delete cpu_vectors[i]; - } - for (int i =0; i < gpu_vectors.size(); i ++) { - delete gpu_vectors[i]; - } - cpu_vectors.erase(cpu_vectors.begin()); - gpu_vectors.erase(gpu_vectors.begin()); } INSTANTIATE_CLASS(ACLConcatLayer); diff --git a/src/caffe/layers/acl_conv_layer.cpp b/src/caffe/layers/acl_conv_layer.cpp new file mode 100644 index 00000000..5cc6fcd9 --- /dev/null +++ b/src/caffe/layers/acl_conv_layer.cpp @@ -0,0 +1,147 @@ +#ifdef USE_ACL +#include <algorithm> +#include <vector> +#include "caffe/filler.hpp" +#include "caffe/layers/acl_conv_layer.hpp" + +namespace caffe { + +bool use_direct_conv_=false; +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::LayerSetUp( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + ConvolutionLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; +} + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + + arm_compute::TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + + // Initialize ACL. + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + int stride_x =this->stride_.mutable_cpu_data()[1]; + int stride_y =this->stride_.mutable_cpu_data()[0]; + int pad_x=this->pad_.mutable_cpu_data()[1]; + int pad_y=this->pad_.mutable_cpu_data()[0]; + unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1]; + unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0]; + arm_compute::PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y); + arm_compute::TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_/this->group_, (unsigned int)this->num_output_); + arm_compute::TensorShape biases_shape ((unsigned int)this->num_output_); + arm_compute::TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); + group()=this->group_; + + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get())); + //[OFM] + if (this->bias_term_) { + new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get())); + } + + //[width, height, IFM] + new_tensor(input(),input_shape,InputdataPtr(this,bottom)); + //[width, height, OFM] + new_tensor(output(),output_shape,OutputdataPtr(this,top)); + + acl_configure(conv,this,conv_info); +} +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::Reshape( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + ConvolutionLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +bool ACLConvolutionLayer<Dtype>::Bypass_acl( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + bool bypass_acl=false; + if (this->force_bypass_acl_path_|| ((openailab_intfp==0) && (this->group_>=5)) //for performance, more groups impact GPU performance + || ((openailab_intfp != 0 && (top[0]->channels() / this->group_ == 1)))) { + bypass_acl=true; + } + + 
ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) { + bypass_acl=true; + } + /* check dilation */ + int dilated=0; + + for(int i=0;i<this->num_spatial_axes_;i++) + { + const int *p=this->dilation_.cpu_data(); + + if(p[i]!=1) + dilated=1; + } + if(dilated) { + bypass_acl=true; + } + + + if((this->kernel_shape_.mutable_cpu_data()[1]==1||this->kernel_shape_.mutable_cpu_data()[0]==1) && + isScheduleEnable()){ + bypass_acl=true; + } + if((this->kernel_shape_.mutable_cpu_data()[1]==3||this->kernel_shape_.mutable_cpu_data()[0]==3) && + (bottom[0]->channels()<150) && isScheduleEnable()){ + bypass_acl=true; + } + + return bypass_acl; +} + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::Forward_cpu( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + if(isGPUMode()){ + Forward_gpu(bottom, top); + return; + } +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONV_INFO); +#endif //USE_PROFILING + + if (Bypass_acl(bottom,top)) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + + SetupACLOperator(bottom,top); + + // acl fp + if (openailab_intfp==0){ + caffe::acl_run(this,bottom,top); + } + return; +} + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::Forward_gpu( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONV_INFO); +#endif //USE_PROFILING + if (Bypass_acl(bottom,top)) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); +} + +template <typename Dtype> +ACLConvolutionLayer<Dtype>::~ACLConvolutionLayer() { +} + +#ifdef USE_ACL +INSTANTIATE_CLASS(ACLConvolutionLayer); +#endif + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_inner_product_layer.cpp b/src/caffe/layers/acl_inner_product_layer.cpp index 47d1011f..bb819fcc 100644 --- a/src/caffe/layers/acl_inner_product_layer.cpp +++ b/src/caffe/layers/acl_inner_product_layer.cpp @@ -1,6 +1,5 @@ #ifdef USE_ACL #include <vector> - #include "caffe/filler.hpp" #include "caffe/layers/acl_inner_product_layer.hpp" #include "caffe/util/math_functions.hpp" @@ -15,65 +14,30 @@ void ACLInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_FC; } template <typename Dtype> -void ACLInnerProductLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLInnerProductLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ - TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_); - TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_); - TensorShape biases_shape((unsigned int)this->N_); - TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_); - TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_); - checkreshape(input_shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - this->init_layer_=false; + arm_compute::TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_); + arm_compute::TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_); + arm_compute::TensorShape biases_shape((unsigned int)this->N_); + arm_compute::TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_); + arm_compute::TensorShape 
output_shape((unsigned int)this->N_, (unsigned int)this->M_); + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } bool transpose = !this->layer_param_.inner_product_param().transpose(); - this->force_bypass_acl_path_ = false; - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - if (transpose) { - new_tensor(this->gpu().weights,weights_shape_t,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - }else{ - new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - } - tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - if (this->bias_term_) { - new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data())); - tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data())); - } - new_tensor(this->gpu().input,input_shape,(void*)bottom_data); - new_tensor(this->gpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,transpose); + if (transpose) { + new_tensor(weights(),weights_shape_t,GetDataPtr(this,this->blobs_[0].get())); }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - if (transpose) { - new_tensor(this->cpu().weights,weights_shape_t,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - }else{ - new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - } - tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - if (this->bias_term_) { - new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data())); - tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data())); - } - new_tensor(this->cpu().input,input_shape,(void*)bottom_data); - new_tensor(this->cpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,transpose); + new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get())); + } + if (this->bias_term_) { + new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get())); } + new_tensor(input(),input_shape,InputdataPtr(this,bottom)); + new_tensor(output(),output_shape,OutputdataPtr(this,top)); + acl_configure(fc,this,transpose); } template <typename Dtype> void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -82,25 +46,40 @@ void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLInnerProductLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_FC_INFO); #endif //USE_PROFILING - if 
(this->force_bypass_acl_path_) { + if (Bypass_acl(bottom, top)) { InnerProductLayer<Dtype>::Forward_cpu(bottom,top); return; } - Dtype* top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - SetupACLLayer(bottom,top); - tensor_mem(this->cpu().input,(void*)(bottom_data)); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + SetupACLOperator(bottom,top); + + if (this->M_ != 1 && openailab_intfp != 0){ + InnerProductLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + + // ACL FP + if(openailab_intfp == 0){ + caffe::acl_run(this,bottom,top); + } + return; } template <typename Dtype> @@ -109,16 +88,12 @@ void ACLInnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom #ifdef USE_PROFILING logtime_util log_time(ACL_FC_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom, top)) { InnerProductLayer<Dtype>::Forward_cpu(bottom,top); return; } - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - SetupACLLayer(bottom,top); - tensor_mem(this->gpu().input,(void*)(bottom_data)); - gpu_run(); - tensor_mem((void*)(top_data),this->gpu().output); + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> diff --git a/src/caffe/layers/acl_local_connect_layer.cpp b/src/caffe/layers/acl_local_connect_layer.cpp index 4eed72f9..1846faf4 100644 --- a/src/caffe/layers/acl_local_connect_layer.cpp +++ b/src/caffe/layers/acl_local_connect_layer.cpp @@ -15,20 +15,14 @@ void ACLLocalConnectLayer<Dtype>::LayerSetUp( } template <typename Dtype> -void ACLLocalConnectLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLLocalConnectLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ - TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); - checkreshape(input_shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - this->init_layer_=false; - // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } - this->force_bypass_acl_path_=false; + arm_compute::TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + + // Initialize ACL. 
ConvolutionParameter conv_param = this->layer_param_.convolution_param(); int stride_x =this->stride_; int stride_y =this->stride_; @@ -36,73 +30,23 @@ void ACLLocalConnectLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bott int pad_y=this->pad_; unsigned int kernel_x=this->kernel_size_; unsigned int kernel_y=this->kernel_size_; - PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y); - TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_); - TensorShape biases_shape ((unsigned int)this->num_output_); - TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); - - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - //[kernel_x, kernel_y, IFM, OFM] - new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data())); - //[OFM] - if (this->bias_term_) { - new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data())); - tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data())); - } + arm_compute::PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y); + arm_compute::TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_); + arm_compute::TensorShape biases_shape ((unsigned int)this->num_output_); + arm_compute::TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); - //[width, height, IFM] - new_tensor(this->gpu().input,input_shape,(void*)bottom_data); - //[width, height, OFM] - new_tensor(this->gpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - { - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info); -#ifdef USE_PROFILING - } -#endif //USE_PROFILING -#ifdef USE_CONV_CACHE - for(int i = 0; i < 16; ++i){ - fprintf(stderr, "<GPU>check cache[%d]\n", i); - if(this->gpu().cache.layer[i] == nullptr){ - this->gpu().cache.layer[i] = this->gpu().layer; - this->gpu().cache.input[i] = this->gpu().input; - this->gpu().cache.output[i] = this->gpu().output; - this->gpu().cache.weights[i] = this->gpu().weights; - this->gpu().cache.biases[i] = this->gpu().biases; - break; - } - } -#endif //USE_CONV_CACHE - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - //[kernel_x, kernel_y, IFM, OFM] - new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data())); - //[OFM] - if (this->bias_term_) { - new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data())); - tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data())); - } - - //[width, height, IFM] - new_tensor(this->cpu().input,input_shape,(void*)bottom_data); - //[width, height, OFM] - new_tensor(this->cpu().output,output_shape,(void*)top_data); -#ifdef USE_PROFILING - { - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - 
this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info); -#ifdef USE_PROFILING - } -#endif //USE_PROFILING + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get())); + //[OFM] + if (this->bias_term_) { + new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get())); } + + //[width, height, IFM] + new_tensor(input(),input_shape,InputdataPtr(this,bottom)); + //[width, height, OFM] + new_tensor(output(),output_shape,OutputdataPtr(this,top)); + acl_configure(lc,this,conv_info); } template <typename Dtype> void ACLLocalConnectLayer<Dtype>::Reshape( @@ -111,34 +55,37 @@ void ACLLocalConnectLayer<Dtype>::Reshape( } template <typename Dtype> +bool ACLLocalConnectLayer<Dtype>::Bypass_acl( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + bypass_acl=true; + } + + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + if (conv_param.kernel_size_size()>2 ) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLLocalConnectLayer<Dtype>::Forward_cpu( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_LC_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { - LocalConnectLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - - ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - if (conv_param.kernel_size_size()>2 ) { + if (Bypass_acl(bottom,top)) { LocalConnectLayer<Dtype>::Forward_cpu(bottom,top); return; } - SetupACLLayer(bottom,top); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - Dtype* top_data = top[i]->mutable_cpu_data(); - tensor_mem(this->cpu().input,(void*)bottom_data); - cpu_run(); - tensor_mem((void*)top_data,this->cpu().output); - } + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> @@ -148,22 +95,12 @@ void ACLLocalConnectLayer<Dtype>::Forward_gpu( logtime_util log_time(ACL_LC_INFO); #endif //USE_PROFILING ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - if (this->force_bypass_acl_path_) { - LocalConnectLayer<Dtype>::Forward_cpu(bottom,top); - return; - } - if (conv_param.kernel_size_size()>2 ) { + if (Bypass_acl(bottom,top)) { LocalConnectLayer<Dtype>::Forward_cpu(bottom,top); return; } - SetupACLLayer(bottom,top); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - tensor_mem(this->gpu().input,(void*)bottom_data); - gpu_run(); - tensor_mem((void*)top_data,this->gpu().output); - } + SetupACLOperator(bottom,top); + caffe::acl_run(this,bottom,top); } template <typename Dtype> diff --git a/src/caffe/layers/acl_lrn_layer.cpp b/src/caffe/layers/acl_lrn_layer.cpp index db9630da..2a94e010 100644 --- a/src/caffe/layers/acl_lrn_layer.cpp +++ b/src/caffe/layers/acl_lrn_layer.cpp @@ -5,7 +5,7 @@ namespace caffe { -const NormType IN_MAP=(arm_compute::NormType)0; +const arm_compute::NormType IN_MAP=(arm_compute::NormType)0; template <typename Dtype> void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { @@ -13,46 +13,24 @@ void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, 
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN; } template <typename Dtype> -void ACLLRNLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLLRNLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ - TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_); - checkreshape(shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } + arm_compute::TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_); + if (is_operator_init_done(shape)) return; + set_operator_init_done(); - //this->force_bypass_acl_path_=false; - NormalizationLayerInfo *norm_info; + // Initialize ACL. + arm_compute::NormalizationLayerInfo norm_info(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_); if(this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) - norm_info=new NormalizationLayerInfo(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_); + norm_info=arm_compute::NormalizationLayerInfo(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_); else - norm_info=new NormalizationLayerInfo(NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_); + norm_info=arm_compute::NormalizationLayerInfo(arm_compute::NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_); + + new_tensor(input(),shape,InputdataPtr(this,bottom)); + new_tensor(output(),shape,OutputdataPtr(this,top)); + acl_configure(lrn,this,norm_info); - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - new_tensor(this->gpu().input,shape,(void*)bottom_data); - new_tensor(this->gpu().output,shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().output,*norm_info); - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - new_tensor(this->cpu().input,shape,(void*)bottom_data); - new_tensor(this->cpu().output,shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().output,*norm_info); - } - delete norm_info; } template <typename Dtype> void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -62,35 +40,41 @@ void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLLRNLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLLRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_LRN_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) { + if (Bypass_acl(bottom, top)) { LRNLayer<Dtype>::Forward_cpu(bottom,top); return; } const Dtype* 
bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - SetupACLLayer(bottom,top); + SetupACLOperator(bottom,top); switch (this->layer_param_.lrn_param().norm_region()) { case LRNParameter_NormRegion_ACROSS_CHANNELS: for (int n = 0; n < this->num_; ++n) { - tensor_mem(this->cpu().input,(void*)(bottom_data+ bottom[0]->offset(n))); - cpu_run(); - tensor_mem((void*)(top_data + top[0]->offset(n)),this->cpu().output); + acl_run((void*)(bottom_data+ bottom[0]->offset(n)),(void*)(top_data + top[0]->offset(n))); } break; case LRNParameter_NormRegion_WITHIN_CHANNEL: for (int n = 0; n < bottom[0]->num(); ++n) { - tensor_mem(this->cpu().input,(void*)(bottom_data)); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + acl_run((void*)bottom_data,(void*)top_data); bottom_data += bottom[0]->offset(0, 1); top_data += top[0]->offset(0, 1); } @@ -106,26 +90,22 @@ void ACLLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_LRN_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom, top)) { LRNLayer<Dtype>::Forward_cpu(bottom,top); return; } const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - SetupACLLayer(bottom,top); + SetupACLOperator(bottom,top); switch (this->layer_param_.lrn_param().norm_region()) { case LRNParameter_NormRegion_ACROSS_CHANNELS: for (int n = 0; n < this->num_; ++n) { - tensor_mem(this->gpu().input,(void*)(bottom_data+ bottom[0]->offset(n))); - gpu_run(); - tensor_mem((void*)(top_data + top[0]->offset(n)),this->gpu().output); + acl_run((void*)(bottom_data+ bottom[0]->offset(n)),(void*)(top_data + top[0]->offset(n))); } break; case LRNParameter_NormRegion_WITHIN_CHANNEL: for (int n = 0; n < bottom[0]->num(); ++n) { - tensor_mem(this->gpu().input,(void*)(bottom_data)); - gpu_run(); - tensor_mem((void*)(top_data),this->gpu().output); + acl_run((void*)bottom_data,(void*)top_data); bottom_data += bottom[0]->offset(0, 1); top_data += top[0]->offset(0, 1); } diff --git a/src/caffe/layers/acl_pooling_layer.cpp b/src/caffe/layers/acl_pooling_layer.cpp index f72b2235..f62fb5d4 100644 --- a/src/caffe/layers/acl_pooling_layer.cpp +++ b/src/caffe/layers/acl_pooling_layer.cpp @@ -12,48 +12,25 @@ void ACLPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING; } template <typename Dtype> -void ACLPoolingLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLPoolingLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ - TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_,(unsigned int)this->channels_); - TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_,(unsigned int)this->channels_); - checkreshape(in_shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - this->init_layer_=false; - // Initialize ACL. 
- if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } + arm_compute::TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_,(unsigned int)this->channels_); + arm_compute::TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_,(unsigned int)this->channels_); + if (is_operator_init_done(in_shape)) return; + set_operator_init_done(); - this->force_bypass_acl_path_=false; - PoolingLayerInfo *pool_info; + // Initialize ACL. + arm_compute::PoolingLayerInfo pool_info; if(this->layer_param_.pooling_param().pool()==PoolingParameter_PoolMethod_MAX) - pool_info=new PoolingLayerInfo(PoolingType::MAX, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL)); + pool_info=arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, this->kernel_w_, arm_compute::PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,arm_compute::DimensionRoundingType::CEIL)); else - pool_info=new PoolingLayerInfo(PoolingType::AVG, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL)); + pool_info=arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG, this->kernel_w_, arm_compute::PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,arm_compute::DimensionRoundingType::CEIL)); + + new_tensor(input(),in_shape,InputdataPtr(this,bottom)); + new_tensor(output(),out_shape,OutputdataPtr(this,top)); + acl_configure(pooling,this,pool_info); - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - new_tensor(this->gpu().input,in_shape,(void*)bottom_data); - new_tensor(this->gpu().output,out_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().output,*pool_info); - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - new_tensor(this->cpu().input,in_shape,(void*)bottom_data); - new_tensor(this->cpu().output,out_shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().output,*pool_info); - } - delete pool_info; } template <typename Dtype> void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -63,39 +40,44 @@ void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLPoolingLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) { + bypass_acl=true; + } + if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX && + this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) { + bypass_acl=true; + } + if (this->kernel_h_!=this->kernel_w_) { + bypass_acl=true; + } + if (this->kernel_h_!=2 && this->kernel_h_!=3) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_POOLING_INFO); #endif //USE_PROFILING - 
if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) { + if (Bypass_acl(bottom,top)) { PoolingLayer<Dtype>::Forward_cpu(bottom,top); return; } const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX && - this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) { - PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return ; - } - if (this->kernel_h_!=this->kernel_w_ || top.size()>1) { - PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return ; - } - if (this->kernel_h_!=2 && this->kernel_h_!=3) { - PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return ; - } - SetupACLLayer(bottom,top); + SetupACLOperator(bottom,top); for (int n = 0; n < bottom[0]->num(); ++n) { - tensor_mem(this->cpu().input,(void*)(bottom_data)); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + acl_run((void*)bottom_data,(void*)top_data); bottom_data += bottom[0]->offset(1); top_data += top[0]->offset(1); } @@ -107,30 +89,15 @@ void ACLPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_POOLING_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) { + if (Bypass_acl(bottom,top)) { PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return; + return ; } const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX && - this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) { - PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return ; - } - if (this->kernel_h_!=this->kernel_w_) { - PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return ; - } - if (this->kernel_h_!=2 && this->kernel_h_!=3) { - PoolingLayer<Dtype>::Forward_cpu(bottom,top); - return ; - } - SetupACLLayer(bottom,top); + SetupACLOperator(bottom,top); for (int n = 0; n < bottom[0]->num(); ++n) { - tensor_mem(this->gpu().input,(void*)(bottom_data)); - gpu_run(); - tensor_mem((void*)(top_data),this->gpu().output); + acl_run((void*)bottom_data,(void*)top_data); bottom_data += bottom[0]->offset(1); top_data += top[0]->offset(1); } diff --git a/src/caffe/layers/acl_relu_layer.cpp b/src/caffe/layers/acl_relu_layer.cpp index 03194539..2b712dda 100644 --- a/src/caffe/layers/acl_relu_layer.cpp +++ b/src/caffe/layers/acl_relu_layer.cpp @@ -13,9 +13,9 @@ void ACLReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU; } template <typename Dtype> -void ACLReLULayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLReLULayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ - ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::RELU); + ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::RELU); } template <typename Dtype> void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -25,20 +25,32 @@ void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLReLULayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + 
bypass_acl=true; + } + // Fallback to standard Caffe for leaky ReLU. + if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) { + bypass_acl=true; + } + if (isScheduleEnable()) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #ifdef USE_PROFILING logtime_util log_time(ACL_RELU_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { ReLULayer<Dtype>::Forward_cpu(bottom,top); return; } - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) { - ReLULayer<Dtype>::Forward_cpu(bottom, top); - return; - } ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top); } @@ -48,12 +60,7 @@ void ACLReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_RELU_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { - ReLULayer<Dtype>::Forward_cpu(bottom,top); - return; - } - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) { + if (Bypass_acl(bottom,top)) { ReLULayer<Dtype>::Forward_cpu(bottom, top); return; } diff --git a/src/caffe/layers/acl_sigmoid_layer.cpp b/src/caffe/layers/acl_sigmoid_layer.cpp index eac15651..4b3f660c 100644 --- a/src/caffe/layers/acl_sigmoid_layer.cpp +++ b/src/caffe/layers/acl_sigmoid_layer.cpp @@ -14,9 +14,9 @@ void ACLSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> -void ACLSigmoidLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){ - ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::LOGISTIC); +void ACLSigmoidLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); } template <typename Dtype> void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -26,12 +26,21 @@ void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLSigmoidLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLSigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #ifdef USE_PROFILING logtime_util log_time(ACL_SIGMOID_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { SigmoidLayer<Dtype>::Forward_cpu(bottom,top); return; } @@ -44,7 +53,7 @@ void ACLSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_SIGMOID_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { SigmoidLayer<Dtype>::Forward_cpu(bottom,top); return; } diff --git a/src/caffe/layers/acl_softmax_layer.cpp b/src/caffe/layers/acl_softmax_layer.cpp index 1568d3aa..6d3cd93e 100644 --- a/src/caffe/layers/acl_softmax_layer.cpp +++ b/src/caffe/layers/acl_softmax_layer.cpp @@ -13,42 +13,19 @@ 
void ACLSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX; } template <typename Dtype> -void ACLSoftmaxLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, +void ACLSoftmaxLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){ unsigned int channels = bottom[0]->shape(this->softmax_axis_); - TensorShape shape(channels*this->inner_num_); - checkreshape(shape,Caffe::arm_gpu_mode()); - if (!this->init_layer_) return; - this->init_layer_=false; + arm_compute::TensorShape shape(channels*this->inner_num_); + if (is_operator_init_done(shape)) return; + set_operator_init_done(); // Initialize ACL. - if (Caffe::arm_gpu_mode()) { - new_gpulayer(); - }else{ - new_cpulayer(); - } + new_tensor(input(),shape,InputdataPtr(this,bottom)); + new_tensor(output(),shape,OutputdataPtr(this,top)); + acl_configure(softmax,this,NULL); - //this->force_bypass_acl_path_=false; - if (Caffe::arm_gpu_mode()) { - Dtype *top_data = top[0]->mutable_gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - new_tensor(this->gpu().input,shape,(void*)bottom_data); - new_tensor(this->gpu().output,shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->gpu().layer->configure(this->gpu().input,this->gpu().output); - }else{ - Dtype *top_data = top[0]->mutable_cpu_data(); - const Dtype* bottom_data = bottom[0]->cpu_data(); - new_tensor(this->cpu().input,shape,(void*)bottom_data); - new_tensor(this->cpu().output,shape,(void*)top_data); -#ifdef USE_PROFILING - logtime_util log_time(ACL_CONFIG_INFO); -#endif //USE_PROFILING - this->cpu().layer->configure(this->cpu().input,this->cpu().output); - } } template <typename Dtype> void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, @@ -57,29 +34,36 @@ void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLSoftmaxLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_ || this->inner_num_>1) { + bypass_acl=true; + } + return bypass_acl; +} + +template <typename Dtype> void ACLSoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { - if(Caffe::arm_gpu_mode()){ + if(isGPUMode()){ Forward_gpu(bottom, top); return; } #ifdef USE_PROFILING logtime_util log_time(ACL_SOFTMAX_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_ || this->inner_num_>1) { + if (Bypass_acl(bottom,top)) { SoftmaxLayer<Dtype>::Forward_cpu(bottom,top); return ; } const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - SetupACLLayer(bottom,top); + SetupACLOperator(bottom,top); int channels = bottom[0]->shape(this->softmax_axis_); for (int i = 0; i < this->outer_num_; ++i) { - tensor_mem(this->cpu().input,(void*)(bottom_data)); - cpu_run(); - tensor_mem((void*)(top_data),this->cpu().output); + acl_run((void*)bottom_data,(void*)top_data); top_data += channels; bottom_data += channels; } @@ -91,17 +75,15 @@ void ACLSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_SOFTMAX_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_|| this->inner_num_>1) { + if (Bypass_acl(bottom,top)) { SoftmaxLayer<Dtype>::Forward_cpu(bottom,top); return; } const Dtype* bottom_data = 
bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - SetupACLLayer(bottom,top); + SetupACLOperator(bottom,top); for (int i = 0; i < this->outer_num_; ++i) { - tensor_mem(this->gpu().input,(void*)(bottom_data)); - gpu_run(); - tensor_mem((void*)(top_data),this->gpu().output); + acl_run((void*)bottom_data,(void*)top_data); top_data += this->inner_num_; bottom_data += this->inner_num_; } diff --git a/src/caffe/layers/acl_tanh_layer.cpp b/src/caffe/layers/acl_tanh_layer.cpp index a1bb632c..9c1066e1 100644 --- a/src/caffe/layers/acl_tanh_layer.cpp +++ b/src/caffe/layers/acl_tanh_layer.cpp @@ -14,9 +14,9 @@ void ACLTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> -void ACLTanHLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, - const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){ - ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::TANH); +void ACLTanHLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::TANH); } template <typename Dtype> @@ -27,12 +27,20 @@ void ACLTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, } template <typename Dtype> +bool ACLTanHLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){ + bool bypass_acl=false; + if (this->force_bypass_acl_path_) { + bypass_acl=true; + } + return bypass_acl; +} +template <typename Dtype> void ACLTanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #ifdef USE_PROFILING logtime_util log_time(ACL_TANH_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { TanHLayer<Dtype>::Forward_cpu(bottom,top); return; } @@ -45,7 +53,7 @@ void ACLTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, #ifdef USE_PROFILING logtime_util log_time(ACL_TANH_INFO); #endif //USE_PROFILING - if (this->force_bypass_acl_path_) { + if (Bypass_acl(bottom,top)) { TanHLayer<Dtype>::Forward_cpu(bottom,top); return; } diff --git a/tools/extra/tpi.py b/tools/extra/tpi.py new file mode 100644 index 00000000..f455ed9b --- /dev/null +++ b/tools/extra/tpi.py @@ -0,0 +1,213 @@ +import sys +import os +import re +import pdb +import xlwt + +help_ = ''' +Usage: + python tpi.py log.txt +''' + +#data_list= {} +data_list1= [] +data_list2= [] +cnt=0 +times=0.0 +table_val='' +name_list1= ['allocate','run','configure','tensor_copy','ACL_CONV','ACL_FC','ACL_LRN','ACL_POOLING','ACL_RELU','ACL_SOFTMAX'] + + + + +def getvalpairs(words): + name='' + val='' + for word in words: + if word=='': + continue + if name=='': + name=word + else: + val=word + break; + #print word, + #print '' + return (name,val) + +def addpairstolist(db,name,val,idx): + #pdb.set_trace() + #if idx in db: + # db[idx]['val'] += val + #else: + # db[idx] = {'val':val,'name':name} + + #pdb.set_trace() + + for i in db: + if i['name']==name: + i['val'] += val + return + db.append({'idx':idx,'val':val,'name':name}) + +def gettabnum(line): + start=line.find(':') + if start==-1: + start=0 + else: + start+=1 + #pdb.set_trace() + str=line[start:-1].lstrip(' ') + words=re.split('\t',str) + idx=0 + for word in words: + idx+=1 + if word=='': + continue + break + return idx + +def decodefile(logfile): + 
data_list=data_list1 + for line in open(logfile): + if line.find(':')==-1: + continue + #pdb.set_trace() + #print line, + idx=gettabnum(line) + words=re.split('\t|:| |\r|\n',line) + #print(words) + (name,val)=getvalpairs(words) + #print (name,float(val),eval(val)) + if name == 'used' and val == 'time': + data_list=data_list2 + try: + addpairstolist(data_list,name,float(val),idx) + except ValueError as e: + #print(line) + continue + +def printresult(db): + #for i in db: + # print i, db[i]['idx'],db[i]['val'] + #pdb.set_trace() + db.sort(key=lambda obj:obj.get('idx'), reverse=False) + tpi_start=0 + conv_str='ACL_CONV' + find_acl = 0 + name_index=0 + global trow + global tcol + for i in db: + if i['name']==conv_str: + tpi_start=i['idx'] + + tpi=0 + for i in db: + if i['idx']>=tpi_start: + tpi+=i['val'] + + start=len('ACL_') + + table_head='TPI'+'\t' + table_val='%.4f' % (tpi/times)+'\t' + + for i in db: + #print i + if i['idx']<tpi_start: + if i['name'].find('ACL_')==0: + table_head+=i['name'][start:]+'\t' + else: + table_head+=i['name']+'\t' + table_val+='%.4f' % (i['val']/times)+'\t' + + print(table_head) + print(table_val) + + table_head='TPI'+'\t' + table_val='%.4f' % (tpi/times)+'\t' + + for i in db: + if i['idx']>=tpi_start: + if i['name'].find('ACL_')==0: + #pdb.set_trace() + table_head+=i['name'][start:]+'\t' + else: + table_head+=i['name']+'\t' + table_val+='%.4f' % (i['val']/times)+'\t' + + print(table_head) + print(table_val) + + ws.write(trow, tcol, 'TPI') + ws.write(trow+1,tcol,'%.4f' % (tpi/times)) + tcol+=1 + + temp_row=trow + temp_col=tcol + for i in name_list1: + if i.find('ACL_')==0 and find_acl==0: + temp_row+=2 + temp_col=2 + find_acl=1 + ws.write(temp_row,temp_col,i) + ws.write(temp_row+1,temp_col,'0') + temp_col+=1 + find_acl=0 + + for i in db: + curname=i['name'] + curvalue='%.4f' % (i['val']/times) + if curname == 'ACL_BN': + ws.write(trow+2,7,curname) + ws.write(trow+3,7,curvalue) + + if curname in name_list1: + val_col=name_list1.index(curname)+2 + val_row=trow + # print ('name found'+ curname + curvalue) + # print(val_col) + # print (val_row) + if val_col>5: + val_col-=4 + val_row+=2 + ws.write(val_row,val_col,curname) + ws.write(val_row+1,val_col,curvalue) + + tcol=0 + trow+=4 + + +if __name__ == '__main__' : + if len(sys.argv) < 2: + print(help_) + sys.exit() + else: + logfile = sys.argv[1] + + filename = os.path.basename(logfile) + decodefile(logfile) + + wb = xlwt.Workbook() + ws = wb.add_sheet('testsheet',True) + trow = 0 + tcol = 0 + cnt=1 + times=1.0 + table_val='' + print('1st time:') + ws.write(trow,tcol,'1st time') + tcol+=1 + printresult(data_list1) + + cnt=2 + times=10.0 + table_val='' + print('\nAverage of 2-11 times:') + ws.write(trow, tcol, '2-11 times') + tcol+=1 + printresult(data_list2) + wb.save(filename+'.xls') + print ('Xls file generated') + + |
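A note on the new profiling helper tools/extra/tpi.py added above: it post-processes a CaffeOnACL profiling log (a run built with USE_PROFILING, so that per-layer timing lines such as ACL_CONV and ACL_SOFTMAX appear in the output), needs the xlwt package for the spreadsheet output, and is invoked as "python tpi.py log.txt" per its built-in help text. It prints one TPI table for the first inference and one averaged over runs 2-11, and writes the same figures to log.txt.xls in the current directory.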
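The layer hunks above (acl_softmax_layer.cpp, acl_tanh_layer.cpp) all apply the same refactor: the per-backend SetupACLLayer/cpu_run/gpu_run/tensor_mem plumbing is collapsed into a backend-neutral SetupACLOperator/acl_run pair, and the fallback condition moves into a Bypass_acl() hook shared by the CPU and GPU paths. The snippet below is only a condensed sketch of the resulting softmax CPU forward path, not a verbatim copy of the source; the header path and the base-class helpers (Bypass_acl, SetupACLOperator, acl_run) are assumed to be declared as they are used in the hunks above.

    // Condensed sketch of the post-refactor CPU forward path (header path assumed).
    #include <vector>
    #include "caffe/layers/acl_softmax_layer.hpp"  // assumed location of the declaration

    namespace caffe {

    template <typename Dtype>
    void ACLSoftmaxLayer<Dtype>::Forward_cpu(const std::vector<Blob<Dtype>*>& bottom,
                                             const std::vector<Blob<Dtype>*>& top) {
      // One bypass decision shared by CPU and GPU: fall back to the stock Caffe
      // softmax when bypass is forced or the shape is unsupported (inner_num_ > 1).
      if (Bypass_acl(bottom, top)) {
        SoftmaxLayer<Dtype>::Forward_cpu(bottom, top);
        return;
      }
      // Configure tensors and the ACL operator once; repeat calls are cheap
      // thanks to the is_operator_init_done()/set_operator_init_done() bookkeeping.
      SetupACLOperator(bottom, top);

      const Dtype* bottom_data = bottom[0]->cpu_data();
      Dtype* top_data = top[0]->mutable_cpu_data();
      int channels = bottom[0]->shape(this->softmax_axis_);
      // acl_run() replaces the old tensor_mem()/cpu_run()/tensor_mem() sequence.
      for (int i = 0; i < this->outer_num_; ++i) {
        acl_run((void*)bottom_data, (void*)top_data);
        bottom_data += channels;
        top_data += channels;
      }
    }

    }  // namespace caffe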