diff options
72 files changed, 8287 insertions, 10 deletions
@@ -7,6 +7,12 @@ $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example.) endif include $(CONFIG_FILE) +ifeq ($(CPU_ONLY),1) + USE_CUDA := 0 +endif +ifeq ($(USE_ACL),1) + USE_CUDA := 0 +endif BUILD_DIR_LINK := $(BUILD_DIR) ifeq ($(RELEASE_BUILD_DIR),) RELEASE_BUILD_DIR := .$(BUILD_DIR)_release @@ -172,13 +178,13 @@ endif CUDA_LIB_DIR += $(CUDA_DIR)/lib INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include -ifneq ($(CPU_ONLY), 1) +ifeq ($(USE_CUDA), 1) INCLUDE_DIRS += $(CUDA_INCLUDE_DIR) LIBRARY_DIRS += $(CUDA_LIB_DIR) LIBRARIES := cudart cublas curand endif -LIBRARIES += glog gflags protobuf boost_system boost_filesystem m hdf5_hl hdf5 +LIBRARIES += glog gflags protobuf boost_system boost_filesystem m # handle IO dependencies USE_LEVELDB ?= 1 @@ -271,7 +277,7 @@ endif # libstdc++ for NVCC compatibility on OS X >= 10.9 with CUDA < 7.0 ifeq ($(OSX), 1) CXX := /usr/bin/clang++ - ifneq ($(CPU_ONLY), 1) + ifeq ($(USE_CUDA), 1) CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | tr -d '[a-z ]') ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1) CXXFLAGS += -stdlib=libstdc++ @@ -296,6 +302,10 @@ ifeq ($(OSX), 1) ORIGIN := @loader_path VERSIONFLAGS += -Wl,-install_name,@rpath/$(DYNAMIC_VERSIONED_NAME_SHORT) -Wl,-rpath,$(ORIGIN)/../../build/lib else + ifeq (${USE_OPENMP}, 1) + CXXFLAGS += -fopenmp + LINKFLAGS += -fopenmp + endif ORIGIN := \$$ORIGIN endif @@ -334,6 +344,27 @@ ifeq ($(USE_NCCL), 1) COMMON_FLAGS += -DUSE_NCCL endif +# ACL acceleration configuration +ifeq ($(USE_ACL), 1) + LIBRARY_DIRS += $(ACL_LIBS_DIR) + LIBRARIES += $(ACL_LIBS) + INCLUDE_DIRS +=$(ACL_INCS) + COMMON_FLAGS += -DUSE_ACL -std=c++11 +endif + +#USE_PROFILING -- get profiling informations, is controled by LOGACL +#LAYER_PERF_STAT -- haitao's net profiling information +ifeq ($(USE_PROFILING), 1) + COMMON_FLAGS += -DUSE_PROFILING -DLAYER_PERF_STAT +endif +#HDF5 +ifeq ($(USE_HDF5), 1) + LIBRARY_DIRS += $(HDF5_LIBRARY_DIRS) + LIBRARIES += $(HDF5_LIBRARIES) + INCLUDE_DIRS +=$(HDF5_INCLUDE_DIRS) + COMMON_FLAGS += -DUSE_HDF5 +endif + # configure IO libraries ifeq ($(USE_OPENCV), 1) COMMON_FLAGS += -DUSE_OPENCV @@ -358,6 +389,15 @@ ifeq ($(CPU_ONLY), 1) COMMON_FLAGS += -DCPU_ONLY endif +ifeq ($(USE_ACL), 1) + OBJS := $(PROTO_OBJS) $(CXX_OBJS) + TEST_OBJS := $(TEST_CXX_OBJS) + TEST_BINS := $(TEST_CXX_BINS) + ALL_WARNS := $(ALL_CXX_WARNS) + TEST_FILTER := --gtest_filter="-*GPU*" + COMMON_FLAGS += -DCPU_ONLY +endif + # Python layer support ifeq ($(WITH_PYTHON_LAYER), 1) COMMON_FLAGS += -DWITH_PYTHON_LAYER @@ -365,7 +405,8 @@ ifeq ($(WITH_PYTHON_LAYER), 1) endif # BLAS configuration (default = ATLAS) -BLAS ?= atlas +#BLAS ?= atlas +BLAS ?= open ifeq ($(BLAS), mkl) # MKL LIBRARIES += mkl_rt diff --git a/Makefile.config.acl b/Makefile.config.acl new file mode 100644 index 00000000..b30759fb --- /dev/null +++ b/Makefile.config.acl @@ -0,0 +1,140 @@ +## Refer to http://caffe.berkeleyvision.org/installation.html +# Contributions simplifying and improving our build system are welcome! + +# cuDNN acceleration switch (uncomment to build with cuDNN). +# USE_CUDNN := 1 + +# CPU-only switch (uncomment to build without GPU support). +CPU_ONLY := 1 + +USE_PROFILING := 0 + +USE_ACL :=1 +ACL_ROOT :=/home/firefly/ComputeLibrary +ACL_INCS :=$(ACL_ROOT)/include +ACL_INCS +=$(ACL_ROOT) +ACL_LIBS_DIR :=$(ACL_ROOT)/build +ACL_LIBS_DIR +=$(ACL_ROOT)/build/arm_compute +ACL_LIBS :=arm_compute OpenCL + +# uncomment to disable IO dependencies and corresponding data layers +# USE_OPENCV := 0 +# USE_LEVELDB := 0 +# USE_LMDB := 0 + +# uncomment to allow MDB_NOLOCK when reading LMDB files (only if necessary) +# You should not set this flag if you will be reading LMDBs with any +# possibility of simultaneous read and write +# ALLOW_LMDB_NOLOCK := 1 + +# Uncomment if you're using OpenCV 3 +# OPENCV_VERSION := 3 + +# To customize your choice of compiler, uncomment and set the following. +# N.B. the default for Linux is g++ and the default for OSX is clang++ +# CUSTOM_CXX := g++ +#CUSTOM_CXX := aarch64-linux-gnu-g++ +#os :=linux +#arch :=arm64-v8a + +# CUDA directory contains bin/ and lib/ directories that we need. +CUDA_DIR := /usr/local/cuda +# On Ubuntu 14.04, if cuda tools are installed via +# "sudo apt-get install nvidia-cuda-toolkit" then use this instead: +# CUDA_DIR := /usr + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility. +# For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility. +CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ + -gencode arch=compute_20,code=sm_21 \ + -gencode arch=compute_30,code=sm_30 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_50,code=sm_50 \ + -gencode arch=compute_52,code=sm_52 \ + -gencode arch=compute_60,code=sm_60 \ + -gencode arch=compute_61,code=sm_61 \ + -gencode arch=compute_61,code=compute_61 + +# BLAS choice: +# atlas for ATLAS (default) +# mkl for MKL +# open for OpenBlas +#BLAS := atlas +BLAS := open +# Custom (MKL/ATLAS/OpenBLAS) include and lib directories. +# Leave commented to accept the defaults for your choice of BLAS +# (which should work)! +# BLAS_INCLUDE := /path/to/your/blas +# BLAS_LIB := /path/to/your/blas + +# Homebrew puts openblas in a directory that is not on the standard search path +# BLAS_INCLUDE := $(shell brew --prefix openblas)/include +# BLAS_LIB := $(shell brew --prefix openblas)/lib + +# This is required only if you will compile the matlab interface. +# MATLAB directory should contain the mex binary in /bin. +# MATLAB_DIR := /usr/local +# MATLAB_DIR := /Applications/MATLAB_R2012b.app + +# NOTE: this is required only if you will compile the python interface. +# We need to be able to find Python.h and numpy/arrayobject.h. +PYTHON_INCLUDE := /usr/include/python2.7 \ + /usr/lib/python2.7/dist-packages/numpy/core/include +# Anaconda Python distribution is quite popular. Include path: +# Verify anaconda location, sometimes it's in root. +# ANACONDA_HOME := $(HOME)/anaconda +# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ + # $(ANACONDA_HOME)/include/python2.7 \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include + +# Uncomment to use Python 3 (default is Python 2) +# PYTHON_LIBRARIES := boost_python3 python3.5m +# PYTHON_INCLUDE := /usr/include/python3.5m \ +# /usr/lib/python3.5/dist-packages/numpy/core/include + +# We need to be able to find libpythonX.X.so or .dylib. +PYTHON_LIB := /usr/lib +# PYTHON_LIB := $(ANACONDA_HOME)/lib + +# Homebrew installs numpy in a non standard path (keg only) +# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include +# PYTHON_LIB += $(shell brew --prefix numpy)/lib + +# Uncomment to support layers written in Python (will link against Python libs) +# WITH_PYTHON_LAYER := 1 + +# Whatever else you find you need goes here. +INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include +LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib + +# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies +# INCLUDE_DIRS += $(shell brew --prefix)/include +# LIBRARY_DIRS += $(shell brew --prefix)/lib + +# NCCL acceleration switch (uncomment to build with NCCL) +# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0) +# USE_NCCL := 1 + +# Uncomment to use `pkg-config` to specify OpenCV library paths. +# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) +# USE_PKG_CONFIG := 1 + +# N.B. both build and distribute dirs are cleared on `make clean` +BUILD_DIR := build +DISTRIBUTE_DIR := distribute + +#HDF5 +USE_HDF5 := 1 +HDF5_INCLUDE_DIRS := /usr/include/hdf5/serial +HDF5_LIBRARY_DIRS := /usr/lib/aarch64-linux-gnu/hdf5/serial +HDF5_LIBRARIES :=hdf5_hl hdf5 + +# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 +# DEBUG := 1 + +# The ID of the GPU that 'make runtest' will use to run unit tests. +TEST_GPUID := 0 + +# enable pretty build (comment to see full commands) +Q ?= @ @@ -1,3 +1,14 @@ +# CaffeOnACL +[![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE) + +CaffeOnACL is a project to use ARM Compute Library (NEON+GPU) to speed up caffe and provide utilities to debug, profile and tune application performance. + +Check out the documents for the details like +- [release notes](https://github.com/OAID/caffeOnACL/tree/master/docs/caffeOnACL_release_notes_0_2_0.docx) +- [user guide](https://github.com/OAID/caffeOnACL/tree/master/docs/caffeOnACL_user_guide_0_2_0.docx) + + + # Caffe [![Build Status](https://travis-ci.org/BVLC/caffe.svg?branch=master)](https://travis-ci.org/BVLC/caffe) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 02c81525..888443d0 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -66,6 +66,21 @@ if(USE_LEVELDB) list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_LEVELDB) endif() +# ---[ ACL +if(USE_ACL) + find_package(ACL REQUIRED) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${ACL_INCLUDE}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${ACL_INCLUDE}/include) + set(__list ${ACL_LIBRARIES}) + separate_arguments(__list) + list(REMOVE_DUPLICATES __list) + foreach(i ${__list}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${i}) + endforeach() + list(APPEND Caffe_COMPILE_OPTIONS PRIVATE -std=c++11) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_ACL) +endif() + # ---[ Snappy if(USE_LEVELDB) find_package(Snappy REQUIRED) diff --git a/cmake/Modules/FindACL.cmake b/cmake/Modules/FindACL.cmake new file mode 100644 index 00000000..d7df5aaa --- /dev/null +++ b/cmake/Modules/FindACL.cmake @@ -0,0 +1,37 @@ +set(ACL_INC_PATHS + /usr/include + /usr/local/include + /usr/local/acl + $ENV{ACL_DIR}/include + ) + +set(ACL_LIB_PATHS + /lib + /lib64 + /usr/lib + /usr/lib64 + /usr/local/lib + /usr/local/lib64 + /usr/local/acl/lib + /usr/local/acl/lib64 + $ENV{ACL_DIR}/lib + ) + +find_path(ACL_INCLUDE NAMES arm_compute PATHS ${ACL_INC_PATHS}) +find_library(ACL_LIBRARIES NAMES arm_compute-static PATHS ${ACL_LIB_PATHS}) +find_library(ACL_CORE_LIBRARIES NAMES arm_compute_core-static PATHS ${ACL_LIB_PATHS}) +SET(ACL_LIBRARIES "${ACL_CORE_LIBRARIES} ${ACL_LIBRARIES}") + +if(ACL_INCS) + SET(ACL_INCLUDE "${ACL_INCS}") + SET(ACL_LIBRARIES "${ACL_LIBS}") + SET(ACL_FOUND 1) +else () + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(ACL DEFAULT_MSG ACL_INCLUDE ACL_LIBRARIES) +endif () + +if (ACL_FOUND) + message(STATUS "Found ACL (include: ${ACL_INCLUDE}, library: ${ACL_LIBRARIES})") + mark_as_advanced(ACL_INCLUDE ACL_LIBRARIES) +endif () diff --git a/docs/caffeOnACL_release_notes_0_2_0.docx b/docs/caffeOnACL_release_notes_0_2_0.docx Binary files differnew file mode 100644 index 00000000..0c1c0141 --- /dev/null +++ b/docs/caffeOnACL_release_notes_0_2_0.docx diff --git a/docs/caffeOnACL_user_guide_0_2_0.docx b/docs/caffeOnACL_user_guide_0_2_0.docx Binary files differnew file mode 100644 index 00000000..a7ebf61d --- /dev/null +++ b/docs/caffeOnACL_user_guide_0_2_0.docx diff --git a/examples/cpp_classification/classification_profiling.cpp b/examples/cpp_classification/classification_profiling.cpp new file mode 100644 index 00000000..f5d5eaed --- /dev/null +++ b/examples/cpp_classification/classification_profiling.cpp @@ -0,0 +1,546 @@ +#include <caffe/caffe.hpp> +#ifdef USE_OPENCV +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#endif // USE_OPENCV +#include <algorithm> +#include <iosfwd> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#ifdef USE_PROFILING + +#include <iostream> + +#include <time.h> + +#define REPEAT_TEST + +unsigned long get_cur_time(void) +{ + struct timespec tm; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &tm); + + return (tm.tv_sec*1000+tm.tv_nsec/1000000); +} + +#endif //USE_PROFILING + +#ifdef USE_OPENCV +using namespace caffe; // NOLINT(build/namespaces) +using std::string; + +/* Pair (label, confidence) representing a prediction. */ +typedef std::pair<string, float> Prediction; + +class Classifier { + public: + Classifier(const string& model_file, + const string& trained_file, + const string& mean_file, + const string& label_file); + + std::vector<Prediction> Classify(const cv::Mat& img, int N = 5); + +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT + void dump_perf_stat(void); + void dump_single_layer_io(int idx, Layer<float> * p_layer); + void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time); +#ifdef REPEAT_TEST + void collect_layer_stat(vector<vector<perf_stat> * > & all_stat); + void dump_all_stat(vector <vector<perf_stat>*>& all_stat); + void reset_layer_stat(); +#endif +#endif + +#endif //USE_PROFILING + + private: + void SetMean(const string& mean_file); + + std::vector<float> Predict(const cv::Mat& img); + + void WrapInputLayer(std::vector<cv::Mat>* input_channels); + + void Preprocess(const cv::Mat& img, + std::vector<cv::Mat>* input_channels); + + private: + shared_ptr<Net<float> > net_; + cv::Size input_geometry_; + int num_channels_; + cv::Mat mean_; + std::vector<string> labels_; +}; + +Classifier::Classifier(const string& model_file, + const string& trained_file, + const string& mean_file, + const string& label_file) { +#ifdef CPU_ONLY + Caffe::set_mode(Caffe::CPU); +#else + Caffe::set_mode(Caffe::GPU); +#endif + + /* Load the network. */ + net_.reset(new Net<float>(model_file, TEST)); + net_->CopyTrainedLayersFrom(trained_file); + + CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input."; + CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output."; + + Blob<float>* input_layer = net_->input_blobs()[0]; + num_channels_ = input_layer->channels(); + CHECK(num_channels_ == 3 || num_channels_ == 1) + << "Input layer should have 1 or 3 channels."; + input_geometry_ = cv::Size(input_layer->width(), input_layer->height()); + + /* Load the binaryproto mean file. */ + SetMean(mean_file); + + /* Load labels. */ + std::ifstream labels(label_file.c_str()); + CHECK(labels) << "Unable to open labels file " << label_file; + string line; + while (std::getline(labels, line)) + labels_.push_back(string(line)); + + Blob<float>* output_layer = net_->output_blobs()[0]; + CHECK_EQ(labels_.size(), output_layer->channels()) + << "Number of labels is different from the output layer dimension."; +} + +static bool PairCompare(const std::pair<float, int>& lhs, + const std::pair<float, int>& rhs) { + return lhs.first > rhs.first; +} + +/* Return the indices of the top N values of vector v. */ +static std::vector<int> Argmax(const std::vector<float>& v, int N) { + std::vector<std::pair<float, int> > pairs; + for (size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector<int> result; + for (int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +/* Return the top N predictions. */ +std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) { + std::vector<float> output = Predict(img); + + N = std::min<int>(labels_.size(), N); + std::vector<int> maxN = Argmax(output, N); + std::vector<Prediction> predictions; + for (int i = 0; i < N; ++i) { + int idx = maxN[i]; + predictions.push_back(std::make_pair(labels_[idx], output[idx])); + } + + return predictions; +} + +/* Load the mean file in binaryproto format. */ +void Classifier::SetMean(const string& mean_file) { + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + + /* Convert from BlobProto to Blob<float> */ + Blob<float> mean_blob; + mean_blob.FromProto(blob_proto); + CHECK_EQ(mean_blob.channels(), num_channels_) + << "Number of channels of mean file doesn't match input layer."; + + /* The format of the mean file is planar 32-bit float BGR or grayscale. */ + std::vector<cv::Mat> channels; + float* data = mean_blob.mutable_cpu_data(); + for (int i = 0; i < num_channels_; ++i) { + /* Extract an individual channel. */ + cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data); + channels.push_back(channel); + data += mean_blob.height() * mean_blob.width(); + } + + /* Merge the separate channels into a single image. */ + cv::Mat mean; + cv::merge(channels, mean); + + /* Compute the global mean pixel value and create a mean image + * filled with this value. */ + cv::Scalar channel_mean = cv::mean(mean); + mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean); +} + +std::vector<float> Classifier::Predict(const cv::Mat& img) { + Blob<float>* input_layer = net_->input_blobs()[0]; + input_layer->Reshape(1, num_channels_, + input_geometry_.height, input_geometry_.width); + /* Forward dimension change to all layers. */ + net_->Reshape(); + + std::vector<cv::Mat> input_channels; + WrapInputLayer(&input_channels); + + Preprocess(img, &input_channels); + +#ifdef USE_PROFILING + unsigned long tstart=get_cur_time(); +#endif //USE_PROFILING + + net_->Forward(); + +#ifdef USE_PROFILING + + unsigned long tend=get_cur_time(); + + std::cout<<"used time: "<<tend-tstart<<std::endl; + +#ifdef LAYER_PERF_STAT + dump_perf_stat(); +#ifdef REPEAT_TEST + + reset_layer_stat(); + + vector<vector<perf_stat>* > all_stat; + int rep_number=10; + + for(int i=0;i<rep_number;i++) + { + net_->Forward(); + collect_layer_stat(all_stat); + reset_layer_stat(); + } + + //dump stats + dump_all_stat(all_stat); + + for(int i=0;i<all_stat.size();i++) + delete all_stat[i]; + +#endif //REPEAT_TEST +#endif //LAYER_PERF_STAT +#endif //USE_PROFILING + + /* Copy the output layer to a std::vector */ + Blob<float>* output_layer = net_->output_blobs()[0]; + const float* begin = output_layer->cpu_data(); + const float* end = begin + output_layer->channels(); + return std::vector<float>(begin, end); +} + +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT + +#ifdef REPEAT_TEST +void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat) +{ + vector<perf_stat > * p_stat; + perf_stat * p_time_stat; + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + + + p_stat=new vector<perf_stat>; + + for (int i =0;i< layers.size(); i++) { + p_time_stat=layers[i]->get_time_stat(); + p_stat->push_back(*p_time_stat); + + } + + all_stat.push_back(p_stat); +} + +void Classifier::reset_layer_stat(void) +{ + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + perf_stat * p_time_stat; + + for (int i =0;i< layers.size(); i++) { + p_time_stat=layers[i]->get_time_stat(); + + p_time_stat->count=0; + p_time_stat->total=0; + p_time_stat->used=p_time_stat->start=p_time_stat->end=0; + } +} + +void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat) +{ + + struct new_perf_stat { + perf_stat stat; + int idx; + }; + + vector<new_perf_stat > layer_stat; + perf_stat * p_stat; + + uint64_t total_time=0; + + layer_stat.resize(all_stat[0]->size()); + + for(int i=0;i<all_stat.size();i++) + { + for(int j=0;j<layer_stat.size();j++) + { + p_stat=&layer_stat[j].stat; + + p_stat->total+=(*all_stat[i])[j].total; + p_stat->count+=(*all_stat[i])[j].count; + total_time+=(*all_stat[i])[j].total; + } + } + + total_time=total_time/all_stat.size(); + + std::cout<<std::endl<<"----------------------------------"<<std::endl; + std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl; + std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl; + std::cout<<"Each layer stats: ..."<<std::endl; + + + for(int i=layer_stat.size()-1;i>=0;i--) + { + p_stat=&layer_stat[i].stat; + + layer_stat[i].idx=i; + + std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size(); + std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100; + std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl; + } + + std::cout<<std::endl; + + std::cout<<"time cost top 10 layers are: ..."<<std::endl; + + std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b) + { + if(a.stat.total>b.stat.total) + return true; + else + return false; + }); + + uint64_t top_total_time=0; + + for(int i=0; i<10; i++) + { + p_stat=&layer_stat[i].stat; + + std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size(); + std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100; + std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl; + top_total_time+=p_stat->total; + } + + std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl; + + std::cout<<std::endl; +} + +#endif + +void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer) +{ + const LayerParameter& layer_param=p_layer->layer_param(); + + std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name(); + std::cout<<" type: "<<layer_param.type()<<std::endl; + + const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom; + + for(int i=0;i<layer_param.bottom_size(); i++) + { + std::cout<<"bottom "<<layer_param.bottom(i)<<": "; + + Blob<float> * p_blob=(*p_bottom_vec)[i]; + + for(int j=0;j<p_blob->num_axes();j++) + { + std::cout<<p_blob->shape(j)<<" "; + } + std::cout<<std::endl; + } + + const vector<Blob<float>*> *p_top_vec=p_layer->saved_top; + for(int i=0;i<layer_param.top_size(); i++) + { + std::cout<<"top "<<layer_param.top(i)<<": "; + Blob<float> * p_blob=(*p_top_vec)[i]; + + for(int j=0;j<p_blob->num_axes();j++) + { + std::cout<<p_blob->shape(j)<<" "; + } + std::cout<<std::endl; + } +} + +void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time) +{ + const LayerParameter& layer_param=p_layer->layer_param(); + perf_stat * p_time_stat; + + p_time_stat=p_layer->get_time_stat(); + + std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name(); + std::cout<<" type: "<<layer_param.type(); + std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl; + + + std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count; + if(p_time_stat->count) + { + std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count; + } + + std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end; + std::cout<<std::endl; + + +} + +void Classifier::dump_perf_stat(void) +{ + uint64_t total_net_time=0; + + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + + std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl; + + for (int i = layers.size() - 1; i >= 0; --i) { + dump_single_layer_io(i,layers[i].get()); + } + + + for (int i = layers.size() - 1; i >= 0; --i) { + + perf_stat * p_time_stat; + + p_time_stat=layers[i]->get_time_stat(); + + total_net_time+=p_time_stat->total; + + } + + std::cout<<"Time for each layer ... sum of all layers is : "; + std::cout<<total_net_time<<std::endl; + + for (int i = layers.size() - 1; i >= 0; --i) { + + dump_single_layer_perf(i,layers[i].get(),total_net_time); + } + +} + +#endif + +#endif //USE_PROFILING + +/* Wrap the input layer of the network in separate cv::Mat objects + * (one per channel). This way we save one memcpy operation and we + * don't need to rely on cudaMemcpy2D. The last preprocessing + * operation will write the separate channels directly to the input + * layer. */ +void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) { + Blob<float>* input_layer = net_->input_blobs()[0]; + + int width = input_layer->width(); + int height = input_layer->height(); + float* input_data = input_layer->mutable_cpu_data(); + for (int i = 0; i < input_layer->channels(); ++i) { + cv::Mat channel(height, width, CV_32FC1, input_data); + input_channels->push_back(channel); + input_data += width * height; + } +} + +void Classifier::Preprocess(const cv::Mat& img, + std::vector<cv::Mat>* input_channels) { + /* Convert the input image to the input image format of the network. */ + cv::Mat sample; + if (img.channels() == 3 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY); + else if (img.channels() == 4 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY); + else if (img.channels() == 4 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR); + else if (img.channels() == 1 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR); + else + sample = img; + + cv::Mat sample_resized; + if (sample.size() != input_geometry_) + cv::resize(sample, sample_resized, input_geometry_); + else + sample_resized = sample; + + cv::Mat sample_float; + if (num_channels_ == 3) + sample_resized.convertTo(sample_float, CV_32FC3); + else + sample_resized.convertTo(sample_float, CV_32FC1); + + cv::Mat sample_normalized; + cv::subtract(sample_float, mean_, sample_normalized); + + /* This operation will write the separate BGR planes directly to the + * input layer of the network because it is wrapped by the cv::Mat + * objects in input_channels. */ + cv::split(sample_normalized, *input_channels); + + CHECK(reinterpret_cast<float*>(input_channels->at(0).data) + == net_->input_blobs()[0]->cpu_data()) + << "Input channels are not wrapping the input layer of the network."; +} + +int main(int argc, char** argv) { + if (argc != 6) { + std::cerr << "Usage: " << argv[0] + << " deploy.prototxt network.caffemodel" + << " mean.binaryproto labels.txt img.jpg" << std::endl; + return 1; + } + + ::google::InitGoogleLogging(argv[0]); + + string model_file = argv[1]; + string trained_file = argv[2]; + string mean_file = argv[3]; + string label_file = argv[4]; + Classifier classifier(model_file, trained_file, mean_file, label_file); + + string file = argv[5]; + + std::cout << "---------- Prediction for " + << file << " ----------" << std::endl; + + cv::Mat img = cv::imread(file, -1); + CHECK(!img.empty()) << "Unable to decode image " << file; + std::vector<Prediction> predictions = classifier.Classify(img); + + /* Print the top N predictions. */ + for (size_t i = 0; i < predictions.size(); ++i) { + Prediction p = predictions[i]; + std::cout << std::fixed << std::setprecision(4) << p.second << " - \"" + << p.first << "\"" << std::endl; + } +} +#else +int main(int argc, char** argv) { + LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV."; +} +#endif // USE_OPENCV diff --git a/examples/cpp_classification/classification_profiling_gpu.cpp b/examples/cpp_classification/classification_profiling_gpu.cpp new file mode 100644 index 00000000..3c5e04ad --- /dev/null +++ b/examples/cpp_classification/classification_profiling_gpu.cpp @@ -0,0 +1,546 @@ +#include <caffe/caffe.hpp> +#ifdef USE_OPENCV +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#endif // USE_OPENCV +#include <algorithm> +#include <iosfwd> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#ifdef USE_PROFILING + +#include <iostream> + +#include <time.h> + +#define REPEAT_TEST + +unsigned long get_cur_time(void) +{ + struct timespec tm; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &tm); + + return (tm.tv_sec*1000+tm.tv_nsec/1000000); +} + +#endif //USE_PROFILING + +#ifdef USE_OPENCV +using namespace caffe; // NOLINT(build/namespaces) +using std::string; + +/* Pair (label, confidence) representing a prediction. */ +typedef std::pair<string, float> Prediction; + +class Classifier { + public: + Classifier(const string& model_file, + const string& trained_file, + const string& mean_file, + const string& label_file); + + std::vector<Prediction> Classify(const cv::Mat& img, int N = 5); + +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT + void dump_perf_stat(void); + void dump_single_layer_io(int idx, Layer<float> * p_layer); + void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time); +#ifdef REPEAT_TEST + void collect_layer_stat(vector<vector<perf_stat> * > & all_stat); + void dump_all_stat(vector <vector<perf_stat>*>& all_stat); + void reset_layer_stat(); +#endif +#endif + +#endif //USE_PROFILING + + private: + void SetMean(const string& mean_file); + + std::vector<float> Predict(const cv::Mat& img); + + void WrapInputLayer(std::vector<cv::Mat>* input_channels); + + void Preprocess(const cv::Mat& img, + std::vector<cv::Mat>* input_channels); + + private: + shared_ptr<Net<float> > net_; + cv::Size input_geometry_; + int num_channels_; + cv::Mat mean_; + std::vector<string> labels_; +}; + +Classifier::Classifier(const string& model_file, + const string& trained_file, + const string& mean_file, + const string& label_file) { +//#ifdef CPU_ONLY +// Caffe::set_mode(Caffe::CPU); +//#else + Caffe::set_mode(Caffe::GPU); //For ARM GPU (the code is in CPU_ONLY mode, just set caffe mode to GPU) +//#endif + + /* Load the network. */ + net_.reset(new Net<float>(model_file, TEST)); + net_->CopyTrainedLayersFrom(trained_file); + + CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input."; + CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output."; + + Blob<float>* input_layer = net_->input_blobs()[0]; + num_channels_ = input_layer->channels(); + CHECK(num_channels_ == 3 || num_channels_ == 1) + << "Input layer should have 1 or 3 channels."; + input_geometry_ = cv::Size(input_layer->width(), input_layer->height()); + + /* Load the binaryproto mean file. */ + SetMean(mean_file); + + /* Load labels. */ + std::ifstream labels(label_file.c_str()); + CHECK(labels) << "Unable to open labels file " << label_file; + string line; + while (std::getline(labels, line)) + labels_.push_back(string(line)); + + Blob<float>* output_layer = net_->output_blobs()[0]; + CHECK_EQ(labels_.size(), output_layer->channels()) + << "Number of labels is different from the output layer dimension."; +} + +static bool PairCompare(const std::pair<float, int>& lhs, + const std::pair<float, int>& rhs) { + return lhs.first > rhs.first; +} + +/* Return the indices of the top N values of vector v. */ +static std::vector<int> Argmax(const std::vector<float>& v, int N) { + std::vector<std::pair<float, int> > pairs; + for (size_t i = 0; i < v.size(); ++i) + pairs.push_back(std::make_pair(v[i], i)); + std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare); + + std::vector<int> result; + for (int i = 0; i < N; ++i) + result.push_back(pairs[i].second); + return result; +} + +/* Return the top N predictions. */ +std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) { + std::vector<float> output = Predict(img); + + N = std::min<int>(labels_.size(), N); + std::vector<int> maxN = Argmax(output, N); + std::vector<Prediction> predictions; + for (int i = 0; i < N; ++i) { + int idx = maxN[i]; + predictions.push_back(std::make_pair(labels_[idx], output[idx])); + } + + return predictions; +} + +/* Load the mean file in binaryproto format. */ +void Classifier::SetMean(const string& mean_file) { + BlobProto blob_proto; + ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); + + /* Convert from BlobProto to Blob<float> */ + Blob<float> mean_blob; + mean_blob.FromProto(blob_proto); + CHECK_EQ(mean_blob.channels(), num_channels_) + << "Number of channels of mean file doesn't match input layer."; + + /* The format of the mean file is planar 32-bit float BGR or grayscale. */ + std::vector<cv::Mat> channels; + float* data = mean_blob.mutable_cpu_data(); + for (int i = 0; i < num_channels_; ++i) { + /* Extract an individual channel. */ + cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data); + channels.push_back(channel); + data += mean_blob.height() * mean_blob.width(); + } + + /* Merge the separate channels into a single image. */ + cv::Mat mean; + cv::merge(channels, mean); + + /* Compute the global mean pixel value and create a mean image + * filled with this value. */ + cv::Scalar channel_mean = cv::mean(mean); + mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean); +} + +std::vector<float> Classifier::Predict(const cv::Mat& img) { + Blob<float>* input_layer = net_->input_blobs()[0]; + input_layer->Reshape(1, num_channels_, + input_geometry_.height, input_geometry_.width); + /* Forward dimension change to all layers. */ + net_->Reshape(); + + std::vector<cv::Mat> input_channels; + WrapInputLayer(&input_channels); + + Preprocess(img, &input_channels); + +#ifdef USE_PROFILING + unsigned long tstart=get_cur_time(); +#endif //USE_PROFILING + + net_->Forward(); + +#ifdef USE_PROFILING + + unsigned long tend=get_cur_time(); + + std::cout<<"used time: "<<tend-tstart<<std::endl; + +#ifdef LAYER_PERF_STAT + dump_perf_stat(); +#ifdef REPEAT_TEST + + reset_layer_stat(); + + vector<vector<perf_stat>* > all_stat; + int rep_number=10; + + for(int i=0;i<rep_number;i++) + { + net_->Forward(); + collect_layer_stat(all_stat); + reset_layer_stat(); + } + + //dump stats + dump_all_stat(all_stat); + + for(int i=0;i<all_stat.size();i++) + delete all_stat[i]; + +#endif //REPEAT_TEST +#endif //LAYER_PERF_STAT +#endif //USE_PROFILING + + /* Copy the output layer to a std::vector */ + Blob<float>* output_layer = net_->output_blobs()[0]; + const float* begin = output_layer->cpu_data(); + const float* end = begin + output_layer->channels(); + return std::vector<float>(begin, end); +} + +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT + +#ifdef REPEAT_TEST +void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat) +{ + vector<perf_stat > * p_stat; + perf_stat * p_time_stat; + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + + + p_stat=new vector<perf_stat>; + + for (int i =0;i< layers.size(); i++) { + p_time_stat=layers[i]->get_time_stat(); + p_stat->push_back(*p_time_stat); + + } + + all_stat.push_back(p_stat); +} + +void Classifier::reset_layer_stat(void) +{ + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + perf_stat * p_time_stat; + + for (int i =0;i< layers.size(); i++) { + p_time_stat=layers[i]->get_time_stat(); + + p_time_stat->count=0; + p_time_stat->total=0; + p_time_stat->used=p_time_stat->start=p_time_stat->end=0; + } +} + +void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat) +{ + + struct new_perf_stat { + perf_stat stat; + int idx; + }; + + vector<new_perf_stat > layer_stat; + perf_stat * p_stat; + + uint64_t total_time=0; + + layer_stat.resize(all_stat[0]->size()); + + for(int i=0;i<all_stat.size();i++) + { + for(int j=0;j<layer_stat.size();j++) + { + p_stat=&layer_stat[j].stat; + + p_stat->total+=(*all_stat[i])[j].total; + p_stat->count+=(*all_stat[i])[j].count; + total_time+=(*all_stat[i])[j].total; + } + } + + total_time=total_time/all_stat.size(); + + std::cout<<std::endl<<"----------------------------------"<<std::endl; + std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl; + std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl; + std::cout<<"Each layer stats: ..."<<std::endl; + + + for(int i=layer_stat.size()-1;i>=0;i--) + { + p_stat=&layer_stat[i].stat; + + layer_stat[i].idx=i; + + std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size(); + std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100; + std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl; + } + + std::cout<<std::endl; + + std::cout<<"time cost top 10 layers are: ..."<<std::endl; + + std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b) + { + if(a.stat.total>b.stat.total) + return true; + else + return false; + }); + + uint64_t top_total_time=0; + + for(int i=0; i<10; i++) + { + p_stat=&layer_stat[i].stat; + + std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size(); + std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100; + std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl; + top_total_time+=p_stat->total; + } + + std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl; + + std::cout<<std::endl; +} + +#endif + +void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer) +{ + const LayerParameter& layer_param=p_layer->layer_param(); + + std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name(); + std::cout<<" type: "<<layer_param.type()<<std::endl; + + const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom; + + for(int i=0;i<layer_param.bottom_size(); i++) + { + std::cout<<"bottom "<<layer_param.bottom(i)<<": "; + + Blob<float> * p_blob=(*p_bottom_vec)[i]; + + for(int j=0;j<p_blob->num_axes();j++) + { + std::cout<<p_blob->shape(j)<<" "; + } + std::cout<<std::endl; + } + + const vector<Blob<float>*> *p_top_vec=p_layer->saved_top; + for(int i=0;i<layer_param.top_size(); i++) + { + std::cout<<"top "<<layer_param.top(i)<<": "; + Blob<float> * p_blob=(*p_top_vec)[i]; + + for(int j=0;j<p_blob->num_axes();j++) + { + std::cout<<p_blob->shape(j)<<" "; + } + std::cout<<std::endl; + } +} + +void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time) +{ + const LayerParameter& layer_param=p_layer->layer_param(); + perf_stat * p_time_stat; + + p_time_stat=p_layer->get_time_stat(); + + std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name(); + std::cout<<" type: "<<layer_param.type(); + std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl; + + + std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count; + if(p_time_stat->count) + { + std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count; + } + + std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end; + std::cout<<std::endl; + + +} + +void Classifier::dump_perf_stat(void) +{ + uint64_t total_net_time=0; + + const vector<shared_ptr<Layer<float> > >& layers=net_->layers(); + + std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl; + + for (int i = layers.size() - 1; i >= 0; --i) { + dump_single_layer_io(i,layers[i].get()); + } + + + for (int i = layers.size() - 1; i >= 0; --i) { + + perf_stat * p_time_stat; + + p_time_stat=layers[i]->get_time_stat(); + + total_net_time+=p_time_stat->total; + + } + + std::cout<<"Time for each layer ... sum of all layers is : "; + std::cout<<total_net_time<<std::endl; + + for (int i = layers.size() - 1; i >= 0; --i) { + + dump_single_layer_perf(i,layers[i].get(),total_net_time); + } + +} + +#endif + +#endif //USE_PROFILING + +/* Wrap the input layer of the network in separate cv::Mat objects + * (one per channel). This way we save one memcpy operation and we + * don't need to rely on cudaMemcpy2D. The last preprocessing + * operation will write the separate channels directly to the input + * layer. */ +void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) { + Blob<float>* input_layer = net_->input_blobs()[0]; + + int width = input_layer->width(); + int height = input_layer->height(); + float* input_data = input_layer->mutable_cpu_data(); + for (int i = 0; i < input_layer->channels(); ++i) { + cv::Mat channel(height, width, CV_32FC1, input_data); + input_channels->push_back(channel); + input_data += width * height; + } +} + +void Classifier::Preprocess(const cv::Mat& img, + std::vector<cv::Mat>* input_channels) { + /* Convert the input image to the input image format of the network. */ + cv::Mat sample; + if (img.channels() == 3 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY); + else if (img.channels() == 4 && num_channels_ == 1) + cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY); + else if (img.channels() == 4 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR); + else if (img.channels() == 1 && num_channels_ == 3) + cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR); + else + sample = img; + + cv::Mat sample_resized; + if (sample.size() != input_geometry_) + cv::resize(sample, sample_resized, input_geometry_); + else + sample_resized = sample; + + cv::Mat sample_float; + if (num_channels_ == 3) + sample_resized.convertTo(sample_float, CV_32FC3); + else + sample_resized.convertTo(sample_float, CV_32FC1); + + cv::Mat sample_normalized; + cv::subtract(sample_float, mean_, sample_normalized); + + /* This operation will write the separate BGR planes directly to the + * input layer of the network because it is wrapped by the cv::Mat + * objects in input_channels. */ + cv::split(sample_normalized, *input_channels); + + CHECK(reinterpret_cast<float*>(input_channels->at(0).data) + == net_->input_blobs()[0]->cpu_data()) + << "Input channels are not wrapping the input layer of the network."; +} + +int main(int argc, char** argv) { + if (argc != 6) { + std::cerr << "Usage: " << argv[0] + << " deploy.prototxt network.caffemodel" + << " mean.binaryproto labels.txt img.jpg" << std::endl; + return 1; + } + + ::google::InitGoogleLogging(argv[0]); + + string model_file = argv[1]; + string trained_file = argv[2]; + string mean_file = argv[3]; + string label_file = argv[4]; + Classifier classifier(model_file, trained_file, mean_file, label_file); + + string file = argv[5]; + + std::cout << "---------- Prediction for " + << file << " ----------" << std::endl; + + cv::Mat img = cv::imread(file, -1); + CHECK(!img.empty()) << "Unable to decode image " << file; + std::vector<Prediction> predictions = classifier.Classify(img); + + /* Print the top N predictions. */ + for (size_t i = 0; i < predictions.size(); ++i) { + Prediction p = predictions[i]; + std::cout << std::fixed << std::setprecision(4) << p.second << " - \"" + << p.first << "\"" << std::endl; + } +} +#else +int main(int argc, char** argv) { + LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV."; +} +#endif // USE_OPENCV diff --git a/include/caffe/acl_layer.hpp b/include/caffe/acl_layer.hpp new file mode 100644 index 00000000..db9fee5c --- /dev/null +++ b/include/caffe/acl_layer.hpp @@ -0,0 +1,246 @@ +#ifndef CAFFE_ACL_LAYER_HPP_ +#define CAFFE_ACL_LAYER_HPP_ + +#ifdef USE_ACL +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +using namespace arm_compute; +#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001 +#define FLAGS_ENABLE_ACL_BNLL 0x00000002 +#define FLAGS_ENABLE_ACL_CONV 0x00000004 +#define FLAGS_ENABLE_ACL_FC 0x00000008 +#define FLAGS_ENABLE_ACL_LRN 0x00000010 +#define FLAGS_ENABLE_ACL_POOLING 0x00000020 +#define FLAGS_ENABLE_ACL_RELU 0x00000040 +#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080 +#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100 +#define FLAGS_ENABLE_ACL_TANH 0x00000200 +extern unsigned int bypass_acl_class_layer; +#endif +#ifdef USE_PROFILING +#include "layer.hpp" + +#define MASK_LOG_APP_TIME 0x00000001 +#define MASK_LOG_ALLOCATE 0x00000002 +#define MASK_LOG_RUN 0x00000004 +#define MASK_LOG_CONFIG 0x00000008 +#define MASK_LOG_COPY 0x00000010 +#define MASK_LOG_ABSVAL 0x00000020 +#define MASK_LOG_BNLL 0x00000040 +#define MASK_LOG_CONV 0x00000080 +#define MASK_LOG_FC 0x00000100 +#define MASK_LOG_LRN 0x00000200 +#define MASK_LOG_POOLING 0x00000400 +#define MASK_LOG_RELU 0x00000800 +#define MASK_LOG_SIGMOID 0x00001000 +#define MASK_LOG_SOFTMAX 0x00002000 +#define MASK_LOG_TANH 0x00004000 +#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t" +#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t" +#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t" +#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t" +#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t" +#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t" +#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t" +#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t" +#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t" +#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t" +#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +extern unsigned int acl_log_flags; +#endif //USE_PROFILING +namespace caffe { +#ifdef USE_ACL +enum TensorType{ + tensor_input, + tensor_output, + tensor_weights, + tensor_biases, +}; +template <typename ACLTensor> +class BaseTensor:public ACLTensor{ +public: + BaseTensor(bool share) + :share_(share),type_(tensor_input),allocate_(false){ + } + virtual void bindmem(void *mem,bool share){ + mem_=mem; + share_=share; + } + virtual void settensortype(TensorType type){ + type_=type; + }; + virtual void map(bool blocking = true){} + virtual void unmap(){} + virtual void commit(); + int tensor_copy(void * mem, bool toTensor=true); +protected: + void* mem_; + bool share_; + TensorType type_; + bool allocate_; +}; +class GPUTensor:public BaseTensor<CLTensor>{ +public: + explicit GPUTensor(bool share) + :BaseTensor(share){} + virtual void map(bool blocking = true){ + if (!allocate_){ + CLTensor::allocator()->allocate(); + allocate_=true; + } + CLTensor::map(blocking); + } + virtual void unmap(){ + CLTensor::unmap(); + } +}; +class CPUTensor:public BaseTensor<Tensor>{ +public: + explicit CPUTensor(bool share) + :BaseTensor(share){} + virtual void map(bool blocking = true){ + if (!allocate_){ + Tensor::allocator()->allocate(); + allocate_=true; + } + } + virtual void unmap(){ + } +}; +template <typename ACLLayer,typename ACLTensor> +class ACLXPUBaseLayer{ +public: + virtual void commit(){ + if (input) { + input->settensortype(tensor_input); + input->commit(); + } + if (output){ + output->settensortype(tensor_output); + output->commit(); + } + if (weights){ + weights->settensortype(tensor_weights); + weights->commit(); + } + if (biases){ + biases->settensortype(tensor_biases); + biases->commit(); + } + } + virtual void run(bool gpu){ + commit(); +#ifdef USE_PROFILING + logtime_util log_time(ACL_RUN_INFO); +#endif //USE_PROFILING + layer->run(); + if (gpu) { + // Make sure all the OpenCL jobs are done executing: + CLScheduler::get().sync(); + } + } + virtual bool reshape(TensorShape &shape,TensorType type); + explicit ACLXPUBaseLayer(){ + layer=nullptr; + input=nullptr; + output=nullptr; + weights=nullptr; + biases=nullptr; +#ifdef USE_CONV_CACHE + for(int i = 0; i < 16; ++i){ + cache.layer[i] = nullptr; + cache.input[i] = nullptr; + cache.output[i] = nullptr; + cache.weights[i] = nullptr; + cache.biases[i] = nullptr; + } +#endif //USE_CONV_CACHE + } + virtual void freelayer(){ +#ifndef USE_CONV_CACHE + if (layer) delete layer; + if (input) delete input; + if (output) delete output; + if (weights) delete weights; + if (biases) delete biases; +#endif //USE_CONV_CACHE + layer=nullptr; + input=nullptr; + output=nullptr; + weights=nullptr; + biases=nullptr; + } + virtual ~ACLXPUBaseLayer(){ + freelayer(); + } + ACLLayer *layer; + ACLTensor *input; + ACLTensor *output; + ACLTensor *weights; + ACLTensor *biases; +#ifdef USE_CONV_CACHE + struct{ + ACLLayer *layer[16]; + ACLTensor *input[16]; + ACLTensor *output[16]; + ACLTensor *weights[16]; + ACLTensor *biases[16]; + }cache; +#endif //USE_CONV_CACHE +}; +template <typename GPULayer, typename CPULayer> +class ACLBaseLayer { +public: + explicit ACLBaseLayer(); + virtual void gpu_run(); + virtual void cpu_run(); + virtual ~ACLBaseLayer(); + virtual GPULayer * new_gpulayer(); + virtual CPULayer * new_cpulayer(); + ACLXPUBaseLayer<GPULayer,GPUTensor>& gpu(){ + return gpu_; + } + ACLXPUBaseLayer<CPULayer,CPUTensor>& cpu(){ + return cpu_; + } + bool checkreshape(TensorShape shape,bool gpu=false, TensorType type=tensor_input); + template <typename ACLTensor> bool tensor_mem(ACLTensor *tensor,void *mem,bool share=false); + template <typename ACLTensor> bool tensor_mem(void *mem,ACLTensor *tensor,bool share=false); + template <typename ACLTensor> ACLTensor * new_tensor(TensorShape shape,void *mem=nullptr,bool share=false); +protected: + ACLXPUBaseLayer<GPULayer,GPUTensor> gpu_; + ACLXPUBaseLayer<CPULayer,CPUTensor> cpu_; + bool init_layer_; + bool force_bypass_acl_path_; + +}; + +#endif +} +#define INSTANTIATE_ACLBASECLASS(GPULayer,CPULayer) \ + template class ACLBaseLayer<GPULayer,CPULayer>; + +#define INSTANTIATE_ACLBASE_FUNCTION(GPULayer,CPULayer,ACLTensor) \ + template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem<ACLTensor>(ACLTensor *tensor,void *mem,bool share); \ + template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share); \ + template ACLTensor * ACLBaseLayer<GPULayer,CPULayer>::new_tensor(TensorShape shape,void *mem,bool share); \ + + +#endif diff --git a/include/caffe/caffe.hpp b/include/caffe/caffe.hpp index 06882096..6ea2bb5d 100644 --- a/include/caffe/caffe.hpp +++ b/include/caffe/caffe.hpp @@ -4,6 +4,12 @@ #ifndef CAFFE_CAFFE_HPP_ #define CAFFE_CAFFE_HPP_ +#ifdef USE_ACL +#ifndef CPU_ONLY +#define CPU_ONLY +#endif +#endif + #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/filler.hpp" diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 4904d1d8..34f2b8c4 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -145,7 +145,13 @@ class Caffe { // into the program since that may cause allocation of pinned memory being // freed in a non-pinned way, which may cause problems - I haven't verified // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { Get().mode_ = mode; } +#ifdef USE_ACL + inline static bool arm_gpu_mode() {return Get().use_mali_gpu_;} + inline static void set_mode(Brew mode) { Get().mode_ = CPU; set_arm_gpu_mode(mode==GPU);} + inline static void set_arm_gpu_mode(bool use_mali_gpu) { Get().use_mali_gpu_ = use_mali_gpu;} +#else + inline static void set_mode(Brew mode) { Get().mode_ = mode;} +#endif // Sets the random seed of both boost and curand static void set_random_seed(const unsigned int seed); // Sets the device. Since we have cublas and curand stuff, set device also @@ -175,7 +181,9 @@ class Caffe { shared_ptr<RNG> random_generator_; Brew mode_; - +#ifdef USE_ACL + bool use_mali_gpu_; +#endif // Parallel training int solver_count_; int solver_rank_; diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 30dbfd53..49b1e695 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -10,7 +10,11 @@ #include "caffe/layer_factory.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/util/math_functions.hpp" - +#ifdef USE_PROFILING +#include <sys/time.h> +#define NANO_SEC_CONV 1000000 +extern unsigned int acl_log_flags; +#endif //USE_PROFILING /** Forward declare boost::thread instead of including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010) on OSX. @@ -18,6 +22,64 @@ namespace boost { class mutex; } namespace caffe { +#ifdef USE_PROFILING +class logtime_util +{ + public: + logtime_util(int mask_, const char* information_){ + mask = mask_; + if(acl_log_flags & mask){ + strncpy(information, information_, 255); + gettimeofday(&tv[0], NULL); + } + } + ~logtime_util(){ + if(acl_log_flags & mask){ + long time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + void log_time(bool start) + { + if(acl_log_flags & mask){ + if (start){ + gettimeofday(&tv[0], NULL); + } + else{ + long time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + } +private: + struct timeval tv[2]; + int mask; + char information[256]; +}; + +#ifdef LAYER_PERF_STAT + +struct perf_stat { + +uint64_t total; +uint32_t start; +uint32_t end; +uint32_t used; +uint32_t count; + +perf_stat(): total(0),start(0),end(0),count(0){}; + +}; + + +#endif +#endif //USE_PROFILING /** * @brief An interface for the units of computation which can be composed into a @@ -123,8 +185,13 @@ class Layer { * * Your layer should implement Forward_cpu and (optionally) Forward_gpu. */ +#ifdef USE_PROFILING + Dtype Forward(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); +#else inline Dtype Forward(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); +#endif //USE_PROFILING /** * @brief Given the top blob error gradients, compute the bottom blob error @@ -290,7 +357,19 @@ class Layer { } param_propagate_down_[param_id] = value; } + +#ifdef USE_PROFILING +#ifdef LAYER_PERF_STAT + + const vector<Blob<Dtype>*> * saved_top; + const vector<Blob<Dtype>*> * saved_bottom; + perf_stat * get_time_stat(void) { return &time_stat_;} + perf_stat * get_pmu_stat(int index) { return &pmu_stat_[index];} + +#endif + +#endif //USE_PROFILING protected: /** The protobuf that stores the layer parameters */ @@ -404,8 +483,17 @@ class Layer { private: DISABLE_COPY_AND_ASSIGN(Layer); + +#ifdef USE_PROFILING +#ifdef LAYER_PERF_STAT + perf_stat time_stat_; + perf_stat pmu_stat_[16]; +#endif +#endif //USE_PROFILING }; // class Layer + +#ifndef LAYER_PERF_STAT // Forward and backward wrappers. You should implement the cpu and // gpu specific implementations instead, and should not change these // functions. @@ -445,6 +533,8 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom, return loss; } +#endif + template <typename Dtype> inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, diff --git a/include/caffe/layers/acl_absval_layer.hpp b/include/caffe/layers/acl_absval_layer.hpp new file mode 100644 index 00000000..c1655404 --- /dev/null +++ b/include/caffe/layers/acl_absval_layer.hpp @@ -0,0 +1,57 @@ +#ifndef CAFFE_ACL_ABSVAL_LAYER_HPP_ +#define CAFFE_ACL_ABSVAL_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" +#include "caffe/layers/absval_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#include "caffe/layers/acl_base_activation_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of AbsValLayer. + * Fallback to AbsValLayer for some corner cases. + */ +template <typename Dtype> +class ACLAbsValLayer : public ACLBaseActivationLayer<Dtype>,public AbsValLayer<Dtype> { + public: + explicit ACLAbsValLayer(const LayerParameter& param) + : ACLBaseActivationLayer<Dtype>(param),AbsValLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLAbsValLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); + +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_ABSVAL_LAYER_HPP_ diff --git a/include/caffe/layers/acl_base_activation_layer.hpp b/include/caffe/layers/acl_base_activation_layer.hpp new file mode 100644 index 00000000..e2abdafa --- /dev/null +++ b/include/caffe/layers/acl_base_activation_layer.hpp @@ -0,0 +1,56 @@ +#ifndef CAFFE_ACL_BASE_ACTIVATION_LAYER_HPP_ +#define CAFFE_ACL_BASE_ACTIVATION_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#include "caffe/layers/acl_base_activation_layer.hpp" +#endif + + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of BNLLLayer. + * Fallback to BNLLLayer for some corner cases. + */ +template <typename Dtype> +class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivationLayer> { + public: + explicit ACLBaseActivationLayer(const LayerParameter& param) + {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLBaseActivationLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type=ActivationLayerInfo::ActivationFunction::RELU); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_BASE_ACTIVATION_LAYER_HPP_ diff --git a/include/caffe/layers/acl_bnll_layer.hpp b/include/caffe/layers/acl_bnll_layer.hpp new file mode 100644 index 00000000..ea2f8a16 --- /dev/null +++ b/include/caffe/layers/acl_bnll_layer.hpp @@ -0,0 +1,57 @@ +#ifndef CAFFE_ACL_BNLL_LAYER_HPP_ +#define CAFFE_ACL_BNLL_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" +#include "caffe/layers/bnll_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#include "caffe/layers/acl_base_activation_layer.hpp" +#endif + + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of BNLLLayer. + * Fallback to BNLLLayer for some corner cases. + */ +template <typename Dtype> +class ACLBNLLLayer : public ACLBaseActivationLayer<Dtype>,public BNLLLayer<Dtype> { + public: + explicit ACLBNLLLayer(const LayerParameter& param) + : ACLBaseActivationLayer<Dtype>(param),BNLLLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLBNLLLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_BNLL_LAYER_HPP_ diff --git a/include/caffe/layers/acl_conv_layer.hpp b/include/caffe/layers/acl_conv_layer.hpp new file mode 100644 index 00000000..b4a75848 --- /dev/null +++ b/include/caffe/layers/acl_conv_layer.hpp @@ -0,0 +1,56 @@ +#ifndef CAFFE_ACL_CONV_LAYER_HPP_ +#define CAFFE_ACL_CONV_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/conv_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/* + * @brief ACL implementation of ConvolutionLayer. + * Fallback to ConvolutionLayer for some corner cases. + * +*/ +template <typename Dtype> +class ACLConvolutionLayer : public ACLBaseLayer<CLConvolutionLayer,NEConvolutionLayer>,public ConvolutionLayer<Dtype> { + public: + explicit ACLConvolutionLayer(const LayerParameter& param) + : ConvolutionLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLConvolutionLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_CONV_LAYER_HPP_ diff --git a/include/caffe/layers/acl_inner_product_layer.hpp b/include/caffe/layers/acl_inner_product_layer.hpp new file mode 100644 index 00000000..f42becb0 --- /dev/null +++ b/include/caffe/layers/acl_inner_product_layer.hpp @@ -0,0 +1,54 @@ +#ifndef CAFFE_ACL_INNER_PRODUCT_LAYER_HPP_ +#define CAFFE_ACL_INNER_PRODUCT_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/inner_product_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of InnerProductLayer. + * Fallback to InnerProductLayer for some corner cases. + */ +template <typename Dtype> +class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyConnectedLayer>,public InnerProductLayer<Dtype> { + public: + explicit ACLInnerProductLayer(const LayerParameter& param) + : InnerProductLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLInnerProductLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_INNER_PRODUCT_LAYER_HPP_ diff --git a/include/caffe/layers/acl_lrn_layer.hpp b/include/caffe/layers/acl_lrn_layer.hpp new file mode 100644 index 00000000..6fd9fbc8 --- /dev/null +++ b/include/caffe/layers/acl_lrn_layer.hpp @@ -0,0 +1,54 @@ +#ifndef CAFFE_ACL_LRN_LAYER_HPP_ +#define CAFFE_ACL_LRN_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/lrn_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/* + * @brief ACL implementation of LRNLayer. + * Fallback to LRNLayer for some corner cases. +*/ +template <typename Dtype> +class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLayer>,public LRNLayer<Dtype> { + public: + explicit ACLLRNLayer(const LayerParameter& param) + : LRNLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLLRNLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_LRN_LAYER_HPP_ diff --git a/include/caffe/layers/acl_pooling_layer.hpp b/include/caffe/layers/acl_pooling_layer.hpp new file mode 100644 index 00000000..acca35cf --- /dev/null +++ b/include/caffe/layers/acl_pooling_layer.hpp @@ -0,0 +1,54 @@ +#ifndef CAFFE_ACL_POOLING_LAYER_HPP_ +#define CAFFE_ACL_POOLING_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/pooling_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/* + * @brief ACL implementation of PoolingLayer. + * Fallback to PoolingLayer for some corner cases. +*/ +template <typename Dtype> +class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,public PoolingLayer<Dtype> { + public: + explicit ACLPoolingLayer(const LayerParameter& param) + : PoolingLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLPoolingLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_POOLING_LAYER_HPP_ diff --git a/include/caffe/layers/acl_relu_layer.hpp b/include/caffe/layers/acl_relu_layer.hpp new file mode 100644 index 00000000..041dbecf --- /dev/null +++ b/include/caffe/layers/acl_relu_layer.hpp @@ -0,0 +1,56 @@ +#ifndef CAFFE_ACL_RELU_LAYER_HPP_ +#define CAFFE_ACL_RELU_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" +#include "caffe/layers/relu_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#include "caffe/layers/acl_base_activation_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of ReLULayer. + * Fallback to ReLULayer for some corner cases. + */ +template <typename Dtype> +class ACLReLULayer : public ACLBaseActivationLayer<Dtype>,public ReLULayer<Dtype> { + public: + explicit ACLReLULayer(const LayerParameter& param) + : ACLBaseActivationLayer<Dtype>(param), ReLULayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLReLULayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_RELU_LAYER_HPP_ diff --git a/include/caffe/layers/acl_sigmoid_layer.hpp b/include/caffe/layers/acl_sigmoid_layer.hpp new file mode 100644 index 00000000..8638f73e --- /dev/null +++ b/include/caffe/layers/acl_sigmoid_layer.hpp @@ -0,0 +1,55 @@ +#ifndef CAFFE_ACL_SIGMOID_LAYER_HPP_ +#define CAFFE_ACL_SIGMOID_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" +#include "caffe/layers/sigmoid_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#include "caffe/layers/acl_base_activation_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of SigmoidLayer. + */ +template <typename Dtype> +class ACLSigmoidLayer : public ACLBaseActivationLayer<Dtype>,public SigmoidLayer<Dtype> { + public: + explicit ACLSigmoidLayer(const LayerParameter& param) + : ACLBaseActivationLayer<Dtype>(param),SigmoidLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLSigmoidLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_SIGMOID_LAYER_HPP_ diff --git a/include/caffe/layers/acl_softmax_layer.hpp b/include/caffe/layers/acl_softmax_layer.hpp new file mode 100644 index 00000000..9e450f5d --- /dev/null +++ b/include/caffe/layers/acl_softmax_layer.hpp @@ -0,0 +1,54 @@ +#ifndef CAFFE_ACL_SOFTMAX_LAYER_HPP_ +#define CAFFE_ACL_SOFTMAX_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/softmax_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL implementation of SoftmaxLayer. + * Fallback to SoftmaxLayer for some corner cases. + */ +template <typename Dtype> +class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,public SoftmaxLayer<Dtype> { + public: + explicit ACLSoftmaxLayer(const LayerParameter& param) + : SoftmaxLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLSoftmaxLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_SOFTMAX_LAYER_HPP_ diff --git a/include/caffe/layers/acl_tanh_layer.hpp b/include/caffe/layers/acl_tanh_layer.hpp new file mode 100644 index 00000000..5a74ce5a --- /dev/null +++ b/include/caffe/layers/acl_tanh_layer.hpp @@ -0,0 +1,56 @@ +#ifndef CAFFE_ACL_TANH_LAYER_HPP_ +#define CAFFE_ACL_TANH_LAYER_HPP_ + +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" +#include "caffe/layers/tanh_layer.hpp" + +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" +#include "caffe/layers/acl_base_activation_layer.hpp" +#endif + +namespace caffe { + +#ifdef USE_ACL +/** + * @brief ACL acceleration of TanHLayer. + * Fallback to TanHLayer for some corner cases. + */ +template <typename Dtype> +class ACLTanHLayer : public ACLBaseActivationLayer<Dtype>,public TanHLayer<Dtype> { + public: + explicit ACLTanHLayer(const LayerParameter& param) + : ACLBaseActivationLayer<Dtype>(param),TanHLayer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual ~ACLTanHLayer(); + + protected: + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){ + NOT_IMPLEMENTED; + } + virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type); +}; +#endif + +} // namespace caffe + +#endif // CAFFE_ACL_TANH_LAYER_HPP_ diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index e3fe4fe2..e7ab4b70 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -7,7 +7,11 @@ // Stub out GPU calls as unavailable. +#ifdef USE_ACL +#define NO_GPU +#else #define NO_GPU LOG(FATAL) << "Cannot use GPU in CPU-only Caffe: check mode." +#endif #define STUB_GPU(classname) \ template <typename Dtype> \ diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp index 71549c1c..dbd8bb6c 100644 --- a/include/caffe/util/hdf5.hpp +++ b/include/caffe/util/hdf5.hpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #ifndef CAFFE_UTIL_HDF5_H_ #define CAFFE_UTIL_HDF5_H_ @@ -37,3 +38,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx); } // namespace caffe #endif // CAFFE_UTIL_HDF5_H_ +#endif // USE_HDF5 diff --git a/src/caffe/acl_layer.cpp b/src/caffe/acl_layer.cpp new file mode 100644 index 00000000..4c20037a --- /dev/null +++ b/src/caffe/acl_layer.cpp @@ -0,0 +1,274 @@ +#ifdef USE_ACL +#include "caffe/acl_layer.hpp" + +unsigned int bypass_acl_class_layer = (0 | \ + /*0xffffffff |*/ \ + /*FLAGS_ENABLE_ACL_FC |*/ \ + /*FLAGS_ENABLE_ACL_LRN |*/ \ + 0 ); + +#ifdef USE_PROFILING + +#include "arm_neon.h" + +unsigned int acl_log_flags = (0 | \ + MASK_LOG_APP_TIME | \ + /*MASK_LOG_ALLOCATE | */\ + /*MASK_LOG_ALLOCATE | */\ + /*MASK_LOG_RUN | */\ + /*MASK_LOG_CONFIG | */\ + /*MASK_LOG_COPY | */\ + MASK_LOG_ABSVAL | \ + MASK_LOG_BNLL | \ + MASK_LOG_CONV | \ + MASK_LOG_FC | \ + MASK_LOG_LRN | \ + MASK_LOG_POOLING | \ + MASK_LOG_RELU | \ + MASK_LOG_SIGMOID | \ + MASK_LOG_SOFTMAX | \ + MASK_LOG_TANH | \ + 0); +#include <stdio.h> /* printf */ +#include <stdlib.h> /* getenv */ +#endif //USE_PROFILING + +namespace caffe { +template <typename GPULayer, typename CPULayer> +ACLBaseLayer<GPULayer,CPULayer>::ACLBaseLayer() + :init_layer_(true),force_bypass_acl_path_(false){ + const char* pBypassACL; + pBypassACL = getenv ("BYPASSACL"); + if (pBypassACL){ + unsigned int bacl; + sscanf(pBypassACL,"%i", &bacl); + if(bacl != bypass_acl_class_layer){ + bypass_acl_class_layer = bacl; + printf("BYPASSACL<%s>\n", pBypassACL); + printf("BYPASSACL: %x\n", bypass_acl_class_layer); + } + } +#ifdef USE_PROFILING + const char* pLogACL; + pLogACL = getenv("LOGACL"); + if (pLogACL){ + unsigned int alf; + sscanf(pLogACL,"%i", &alf); + if (alf != acl_log_flags){ + acl_log_flags = alf; + printf("LOGACL<%s>\n", pLogACL); + printf("LOGACL: %x\n", acl_log_flags); + } + } +#endif //USE_PROFILING +} +template <typename GPULayer, typename CPULayer> +void ACLBaseLayer<GPULayer,CPULayer>::gpu_run() { + gpu_.run(true); +} +template <typename GPULayer, typename CPULayer> +void ACLBaseLayer<GPULayer,CPULayer>::cpu_run() { + cpu_.run(false); +} + +template <typename GPULayer, typename CPULayer> +ACLBaseLayer<GPULayer,CPULayer>::~ACLBaseLayer(){ +} +template <typename GPULayer, typename CPULayer> +template <typename ACLTensor> ACLTensor * ACLBaseLayer<GPULayer,CPULayer>::new_tensor(TensorShape shape,void *mem,bool share) +{ + ACLTensor * tensor=new ACLTensor(share); +#if 1 //F32 + tensor->allocator()->init(TensorInfo(shape, Format::F32)); +#else //F16 + tensor->allocator()->init(TensorInfo(shape, Format::F16)); +#endif + tensor->bindmem(mem,share); + return tensor; +} + +template <typename ACLTensor> +void BaseTensor<ACLTensor>::commit(){ + if (!share_&&mem_) { + if (!allocate_){ +#ifdef USE_PROFILING + logtime_util log_time(ACL_ALLOCATE_INFO); +#endif //USE_PROFILING + ACLTensor::allocator()->allocate(); + allocate_=true; + } + if (type_!= tensor_output) { + tensor_copy(mem_); + } + mem_=nullptr; + } +} + +template <typename ACLTensor> +int BaseTensor<ACLTensor>::tensor_copy(void * mem,bool toTensor) +{ +#ifdef USE_PROFILING + logtime_util log_time(ACL_COPY_INFO); +#endif //USE_PROFILING + arm_compute::Window window; + ACLTensor* tensor=this; + window.use_tensor_dimensions(tensor->info(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element) + int width = tensor->info()->tensor_shape()[0]; //->dimension(0); //window.x().end() - window.x().start(); // + 1; + int height = tensor->info()->tensor_shape()[1]; //->dimension(1); //window.y().end() - window.y().start(); // + 1; + int deepth = tensor->info()->tensor_shape()[2]; + map(); + // Create an iterator: + arm_compute::Iterator it(tensor, window); + // Except it works for an arbitrary number of dimensions + if (toTensor) { //mem->tensor + arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) + { +#if 0 //F16 + if (tensor->info()->element_size() ==2) + { + for(int i = 0; i < width; i+= 4){ + auto pa = (float32x4_t*)((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4); + *(float16x4_t*)(((char*)it.ptr()) + i*2) = vcvt_f16_f32(*pa); + } + } + else{ +#endif + memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size()); +#if 0 //F16 + } +#endif + }, + it); + }else{ //tensor-->mem + arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) + { +#if 0 //F16 + if (tensor->info()->element_size() ==2) + { + for(int i = 0; i < width; i+= 4){ + auto pa = (float32x4_t*)(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4)); + *pa = vcvt_f32_f16(*(float16x4_t*)(((char*)it.ptr()) + i*2)); + } + } + else{ +#endif + memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size()); +#if 0 //F16 + } +#endif + }, + it); + } + unmap(); + + return 0; +} + +template <typename GPULayer, typename CPULayer> +template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share) +{ + tensor->bindmem(mem,share); + return true; +} + +template <typename GPULayer, typename CPULayer> +template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share) +{ + if (mem==tensor->buffer()) return true; + if (!share) { + tensor->tensor_copy(mem,false); + } + return true; +} + + +template <typename GPULayer, typename CPULayer> +bool ACLBaseLayer<GPULayer,CPULayer>::checkreshape(TensorShape shape,bool gpu, TensorType type) +{ + if (gpu) { + init_layer_ = gpu_.reshape(shape,type); + }else{ + init_layer_ = cpu_.reshape(shape,type); + } + return init_layer_; +} + +template <typename GPULayer, typename CPULayer> +GPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_gpulayer(){ + gpu_.layer= new GPULayer; + return gpu_.layer; +} +template <typename GPULayer, typename CPULayer> +CPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_cpulayer(){ + cpu_.layer= new CPULayer; + return cpu_.layer; +} +template <typename ACLLayer,typename ACLTensor> +bool ACLXPUBaseLayer<ACLLayer,ACLTensor>::reshape(TensorShape &shape,TensorType type) +{ + TensorShape _shape; + if (!layer) return true; +#ifdef USE_CONV_CACHE + if (tensor_input == type){ + _shape = input->info()->tensor_shape(); + if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { + return false; + } + for(int i = 0; i < 16; ++i){ + if(cache.input[i] == nullptr) break; + _shape = cache.input[i]->info()->tensor_shape(); + if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { + this->layer = cache.layer[i]; + this->input = cache.input[i]; + this->output = cache.output[i]; + this->weights = cache.weights[i]; + this->biases = cache.biases[i]; + return false; + } + } + } +#endif //USE_CONV_CACHE + switch (type) { + case tensor_biases: + _shape = biases->info()->tensor_shape(); + break; + case tensor_weights: + _shape = weights->info()->tensor_shape(); + break; + case tensor_output: + _shape = output->info()->tensor_shape(); + break; + case tensor_input: + default: + _shape = input->info()->tensor_shape(); + break; + } + if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) { + return false; + } + freelayer(); + return true; +} + +INSTANTIATE_ACLBASECLASS(CLNormalizationLayer,NENormalizationLayer); + INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,GPUTensor); + INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,CPUTensor); +INSTANTIATE_ACLBASECLASS(CLActivationLayer,NEActivationLayer); + INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,GPUTensor); + INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,CPUTensor); +INSTANTIATE_ACLBASECLASS(CLPoolingLayer,NEPoolingLayer); + INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,GPUTensor); + INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,CPUTensor); +INSTANTIATE_ACLBASECLASS(CLSoftmaxLayer,NESoftmaxLayer); + INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,GPUTensor); + INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,CPUTensor); +INSTANTIATE_ACLBASECLASS(CLFullyConnectedLayer,NEFullyConnectedLayer); + INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,GPUTensor); + INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,CPUTensor); +INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEConvolutionLayer); + INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,GPUTensor); + INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,CPUTensor); + +} + +#endif diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 4f6f9bcc..63097caa 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -6,6 +6,10 @@ #include "caffe/common.hpp" #include "caffe/util/rng.hpp" +#ifdef USE_ACL +#include "arm_compute/runtime/CL/CLScheduler.h" +using namespace arm_compute; +#endif namespace caffe { @@ -52,8 +56,12 @@ void GlobalInit(int* pargc, char*** pargv) { #ifdef CPU_ONLY // CPU-only Caffe. Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU), - solver_count_(1), solver_rank_(0), multiprocess_(false) { } + : random_generator_(), mode_(Caffe::CPU),use_mali_gpu_(false), + solver_count_(1), solver_rank_(0), multiprocess_(false) { +#ifdef USE_ACL + CLScheduler::get().default_init(); +#endif +} Caffe::~Caffe() { } diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp index 684ae88b..677ed100 100644 --- a/src/caffe/layer.cpp +++ b/src/caffe/layer.cpp @@ -1,7 +1,81 @@ #include "caffe/layer.hpp" +#ifdef USE_PROFILING + +#ifdef LAYER_PERF_STAT +#include <time.h> + +#endif +#endif //USE_PROFILING + namespace caffe { INSTANTIATE_CLASS(Layer); +#ifdef USE_PROFILING +#ifdef LAYER_PERF_STAT + +/* current timestamp in us */ +unsigned long get_cur_time(void) +{ + struct timespec tm; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &tm); + + return (tm.tv_sec*1000000+tm.tv_nsec/1000); +} + + +// Forward and backward wrappers. You should implement the cpu and +// gpu specific implementations instead, and should not change these +// functions. +template <typename Dtype> +Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + Dtype loss = 0; + Reshape(bottom, top); + + saved_top=⊤ + saved_bottom=⊥ + + time_stat_.count++; + time_stat_.start=get_cur_time(); + + switch (Caffe::mode()) { + case Caffe::CPU: + Forward_cpu(bottom, top); + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { continue; } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->cpu_data(); + const Dtype* loss_weights = top[top_id]->cpu_diff(); + loss += caffe_cpu_dot(count, data, loss_weights); + } + break; + case Caffe::GPU: + Forward_gpu(bottom, top); +#ifndef CPU_ONLY + for (int top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { continue; } + const int count = top[top_id]->count(); + const Dtype* data = top[top_id]->gpu_data(); + const Dtype* loss_weights = top[top_id]->gpu_diff(); + Dtype blob_loss = 0; + caffe_gpu_dot(count, data, loss_weights, &blob_loss); + loss += blob_loss; + } +#endif + break; + default: + LOG(FATAL) << "Unknown caffe mode."; + } + time_stat_.end=get_cur_time(); + time_stat_.used=time_stat_.end-time_stat_.start; + time_stat_.total+=time_stat_.used; + return loss; +} + +#endif +#endif //USE_PROFILING + } // namespace caffe diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index f14253a5..f7bf8863 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -27,6 +27,19 @@ #include "caffe/layers/cudnn_tanh_layer.hpp" #endif +#ifdef USE_ACL +#include "caffe/layers/acl_absval_layer.hpp" +#include "caffe/layers/acl_bnll_layer.hpp" +#include "caffe/layers/acl_conv_layer.hpp" +#include "caffe/layers/acl_inner_product_layer.hpp" +#include "caffe/layers/acl_lrn_layer.hpp" +#include "caffe/layers/acl_pooling_layer.hpp" +#include "caffe/layers/acl_relu_layer.hpp" +#include "caffe/layers/acl_sigmoid_layer.hpp" +#include "caffe/layers/acl_softmax_layer.hpp" +#include "caffe/layers/acl_tanh_layer.hpp" +#endif + #ifdef WITH_PYTHON_LAYER #include "caffe/layers/python_layer.hpp" #endif @@ -39,6 +52,9 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer( const LayerParameter& param) { ConvolutionParameter conv_param = param.convolution_param(); ConvolutionParameter_Engine engine = conv_param.engine(); +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype>(param)); +#endif #ifdef USE_CUDNN bool use_dilation = false; for (int i = 0; i < conv_param.dilation_size(); ++i) { @@ -77,6 +93,9 @@ REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer); template <typename Dtype> shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) { PoolingParameter_Engine engine = param.pooling_param().engine(); +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLPoolingLayer<Dtype>(param)); +#endif if (engine == PoolingParameter_Engine_DEFAULT) { engine = PoolingParameter_Engine_CAFFE; #ifdef USE_CUDNN @@ -115,7 +134,9 @@ REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer); template <typename Dtype> shared_ptr<Layer<Dtype> > GetLRNLayer(const LayerParameter& param) { LRNParameter_Engine engine = param.lrn_param().engine(); - +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLLRNLayer<Dtype>(param)); +#endif if (engine == LRNParameter_Engine_DEFAULT) { #ifdef USE_CUDNN engine = LRNParameter_Engine_CUDNN; @@ -153,6 +174,9 @@ REGISTER_LAYER_CREATOR(LRN, GetLRNLayer); template <typename Dtype> shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) { ReLUParameter_Engine engine = param.relu_param().engine(); +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLReLULayer<Dtype>(param)); +#endif if (engine == ReLUParameter_Engine_DEFAULT) { engine = ReLUParameter_Engine_CAFFE; #ifdef USE_CUDNN @@ -177,6 +201,9 @@ REGISTER_LAYER_CREATOR(ReLU, GetReLULayer); template <typename Dtype> shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) { SigmoidParameter_Engine engine = param.sigmoid_param().engine(); +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLSigmoidLayer<Dtype>(param)); +#endif if (engine == SigmoidParameter_Engine_DEFAULT) { engine = SigmoidParameter_Engine_CAFFE; #ifdef USE_CUDNN @@ -201,6 +228,9 @@ REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer); template <typename Dtype> shared_ptr<Layer<Dtype> > GetSoftmaxLayer(const LayerParameter& param) { SoftmaxParameter_Engine engine = param.softmax_param().engine(); +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLSoftmaxLayer<Dtype>(param)); +#endif if (engine == SoftmaxParameter_Engine_DEFAULT) { engine = SoftmaxParameter_Engine_CAFFE; #ifdef USE_CUDNN @@ -225,6 +255,9 @@ REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer); template <typename Dtype> shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) { TanHParameter_Engine engine = param.tanh_param().engine(); +#ifdef USE_ACL + return shared_ptr<Layer<Dtype> >(new ACLTanHLayer<Dtype>(param)); +#endif if (engine == TanHParameter_Engine_DEFAULT) { engine = TanHParameter_Engine_CAFFE; #ifdef USE_CUDNN @@ -245,6 +278,33 @@ shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) { REGISTER_LAYER_CREATOR(TanH, GetTanHLayer); +#ifdef USE_ACL +// Get AbsVal layer according to engine. +template <typename Dtype> +shared_ptr<Layer<Dtype> > GetAbsValLayer(const LayerParameter& param) { + return shared_ptr<Layer<Dtype> >(new ACLAbsValLayer<Dtype>(param)); +} + +REGISTER_LAYER_CREATOR(AbsVal, GetAbsValLayer); + +// Get BNLL layer according to engine. +template <typename Dtype> +shared_ptr<Layer<Dtype> > GetBNLLLayer(const LayerParameter& param) { + return shared_ptr<Layer<Dtype> >(new ACLBNLLLayer<Dtype>(param)); +} + +REGISTER_LAYER_CREATOR(BNLL, GetBNLLLayer); + +// Get InnerProduct layer according to engine. +template <typename Dtype> +shared_ptr<Layer<Dtype> > GetInnerProductLayer(const LayerParameter& param) { + return shared_ptr<Layer<Dtype> >(new ACLInnerProductLayer<Dtype>(param)); +} + +REGISTER_LAYER_CREATOR(InnerProduct, GetInnerProductLayer); + +#endif // USE_ACL + #ifdef WITH_PYTHON_LAYER template <typename Dtype> shared_ptr<Layer<Dtype> > GetPythonLayer(const LayerParameter& param) { diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 855bf0bf..65980e4d 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -39,6 +39,8 @@ STUB_GPU(AbsValLayer); #endif INSTANTIATE_CLASS(AbsValLayer); +#ifndef USE_ACL REGISTER_LAYER_CLASS(AbsVal); +#endif } // namespace caffe diff --git a/src/caffe/layers/acl_absval_layer.cpp b/src/caffe/layers/acl_absval_layer.cpp new file mode 100644 index 00000000..b0b0304d --- /dev/null +++ b/src/caffe/layers/acl_absval_layer.cpp @@ -0,0 +1,64 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/layers/acl_absval_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLAbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + AbsValLayer<Dtype>::LayerSetUp(bottom, top); + ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_ABSVAL; +} + +template <typename Dtype> +void ACLAbsValLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::ABS); +} + +template <typename Dtype> +void ACLAbsValLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + AbsValLayer<Dtype>::Reshape(bottom, top); + ACLBaseActivationLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLAbsValLayer<Dtype>::Forward_cpu( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_ABSVAL_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + AbsValLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top); +} + +template <typename Dtype> +void ACLAbsValLayer<Dtype>::Forward_gpu( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_ABSVAL_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + AbsValLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top); +} + +template <typename Dtype> +ACLAbsValLayer<Dtype>::~ACLAbsValLayer() { +} + +INSTANTIATE_CLASS(ACLAbsValLayer); + +} // namespace caffe + +#endif // USE_ACL diff --git a/src/caffe/layers/acl_base_activation_layer.cpp b/src/caffe/layers/acl_base_activation_layer.cpp new file mode 100644 index 00000000..2595a951 --- /dev/null +++ b/src/caffe/layers/acl_base_activation_layer.cpp @@ -0,0 +1,97 @@ +#ifdef USE_ACL +#include <algorithm> +#include <vector> + +#include "caffe/layers/acl_base_activation_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLBaseActivationLayer<Dtype>::LayerSetUp( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { +} +template <typename Dtype> +void ACLBaseActivationLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){ + + const unsigned int count = bottom[0]->count(); + const unsigned int count_ = top[0]->count(); + TensorShape input_shape(count); + TensorShape output_shape(count_); + checkreshape(input_shape,Caffe::arm_gpu_mode()); + if (!this->init_layer_) return; + this->init_layer_=false; + // Initialize ACL. + if (Caffe::arm_gpu_mode()) { + new_gpulayer(); + }else{ + new_cpulayer(); + } + + this->force_bypass_acl_path_=false; + ActivationLayerInfo act_info(type); + + if(type== ActivationLayerInfo::ActivationFunction::TANH) + act_info=ActivationLayerInfo(type,1.0,1.0); + + + + if (Caffe::arm_gpu_mode()) { + Dtype *top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + this->gpu().input=new_tensor<GPUTensor>(input_shape,(void*)bottom_data); + this->gpu().output=new_tensor<GPUTensor>(output_shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->gpu().layer->configure(this->gpu().input,this->gpu().output,act_info); + }else{ + Dtype *top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + this->cpu().input=new_tensor<CPUTensor>(input_shape,(void*)bottom_data); + this->cpu().output=new_tensor<CPUTensor>(output_shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->cpu().layer->configure(this->cpu().input,this->cpu().output,act_info); + } +} +template <typename Dtype> +void ACLBaseActivationLayer<Dtype>::Reshape( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { +} + +template <typename Dtype> +void ACLBaseActivationLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + if(Caffe::arm_gpu_mode()){ + Forward_gpu(bottom, top); + return; + } + Dtype* top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + SetupACLLayer(bottom,top); + tensor_mem(this->cpu().input,(void*)(bottom_data)); + cpu_run(); + tensor_mem((void*)(top_data),this->cpu().output); +} + +template <typename Dtype> +void ACLBaseActivationLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + SetupACLLayer(bottom,top); + tensor_mem(this->gpu().input,(void*)(bottom_data)); + gpu_run(); + tensor_mem((void*)(top_data),this->gpu().output); +} + +template <typename Dtype> +ACLBaseActivationLayer<Dtype>::~ACLBaseActivationLayer() { +} + +INSTANTIATE_CLASS(ACLBaseActivationLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_bnll_layer.cpp b/src/caffe/layers/acl_bnll_layer.cpp new file mode 100644 index 00000000..86f09831 --- /dev/null +++ b/src/caffe/layers/acl_bnll_layer.cpp @@ -0,0 +1,61 @@ +#ifdef USE_ACL +#include <algorithm> +#include <vector> + +#include "caffe/layers/acl_bnll_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLBNLLLayer<Dtype>::LayerSetUp( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + BNLLLayer<Dtype>::LayerSetUp(bottom, top); + ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BNLL; +} +template <typename Dtype> +void ACLBNLLLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::SOFT_RELU); +} +template <typename Dtype> +void ACLBNLLLayer<Dtype>::Reshape( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + BNLLLayer<Dtype>::Reshape(bottom, top); + ACLBaseActivationLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLBNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_BNLL_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + BNLLLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top); +} + +template <typename Dtype> +void ACLBNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_BNLL_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + BNLLLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top); +} + +template <typename Dtype> +ACLBNLLLayer<Dtype>::~ACLBNLLLayer() { +} + +INSTANTIATE_CLASS(ACLBNLLLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_conv_layer.cpp b/src/caffe/layers/acl_conv_layer.cpp new file mode 100644 index 00000000..02732fb8 --- /dev/null +++ b/src/caffe/layers/acl_conv_layer.cpp @@ -0,0 +1,218 @@ +#ifdef USE_ACL +#include <algorithm> +#include <vector> + +#include "caffe/filler.hpp" +#include "caffe/layers/acl_conv_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::LayerSetUp( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + ConvolutionLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; +} + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + + TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num()); + checkreshape(input_shape,Caffe::arm_gpu_mode()); + if (!this->init_layer_) return; + this->init_layer_=false; + // Initialize ACL. + if (Caffe::arm_gpu_mode()) { + new_gpulayer(); + }else{ + new_cpulayer(); + } + this->force_bypass_acl_path_=false; + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + int stride_x =this->stride_.mutable_cpu_data()[1]; + int stride_y =this->stride_.mutable_cpu_data()[0]; + int pad_x=this->pad_.mutable_cpu_data()[1]; + int pad_y=this->pad_.mutable_cpu_data()[0]; + unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1]; + unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0]; + PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y); + TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_); + TensorShape biases_shape ((unsigned int)this->num_output_); + TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num()); + + if (Caffe::arm_gpu_mode()) { + Dtype *top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + //[kernel_x, kernel_y, IFM, OFM] + this->gpu().weights=new_tensor<GPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data())); + tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data())); + //[OFM] + if (this->bias_term_) { + this->gpu().biases=new_tensor<GPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data())); + tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data())); + } + + //[width, height, IFM] + this->gpu().input=new_tensor<GPUTensor>(input_shape,(void*)bottom_data); + //[width, height, OFM] + this->gpu().output=new_tensor<GPUTensor>(output_shape,(void*)top_data); +#ifdef USE_PROFILING + { + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info); +#ifdef USE_PROFILING + } +#endif //USE_PROFILING +#ifdef USE_CONV_CACHE + for(int i = 0; i < 16; ++i){ + fprintf(stderr, "<GPU>check cache[%d]\n", i); + if(this->gpu().cache.layer[i] == nullptr){ + this->gpu().cache.layer[i] = this->gpu().layer; + this->gpu().cache.input[i] = this->gpu().input; + this->gpu().cache.output[i] = this->gpu().output; + this->gpu().cache.weights[i] = this->gpu().weights; + this->gpu().cache.biases[i] = this->gpu().biases; + break; + } + } +#endif //USE_CONV_CACHE + }else{ + Dtype *top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + //[kernel_x, kernel_y, IFM, OFM] + this->cpu().weights=new_tensor<CPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data())); + tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data())); + //[OFM] + if (this->bias_term_) { + this->cpu().biases=new_tensor<CPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data())); + tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data())); + } + + //[width, height, IFM] + this->cpu().input=new_tensor<CPUTensor>(input_shape,(void*)bottom_data); + //[width, height, OFM] + this->cpu().output=new_tensor<CPUTensor>(output_shape,(void*)top_data); +#ifdef USE_PROFILING + { + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info); +#ifdef USE_PROFILING + } +#endif //USE_PROFILING +#ifdef USE_CONV_CACHE + for(int i = 0; i < 16; ++i){ + fprintf(stderr, "<CPU>check cache[%d]\n", i); + if(this->cpu().cache.layer[i] == nullptr){ + this->cpu().cache.layer[i] = this->cpu().layer; + this->cpu().cache.input[i] = this->cpu().input; + this->cpu().cache.output[i] = this->cpu().output; + this->cpu().cache.weights[i] = this->cpu().weights; + this->cpu().cache.biases[i] = this->cpu().biases; + break; + } + } +#endif //USE_CONV_CACHE + } +} +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::Reshape( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + ConvolutionLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::Forward_cpu( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { + if(Caffe::arm_gpu_mode()){ + Forward_gpu(bottom, top); + return; + } +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONV_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_|| this->group_!=1) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + /* check dilation */ + int dilated=0; + + for(int i=0;i<this->num_spatial_axes_;i++) + { + const int *p=this->dilation_.cpu_data(); + + if(p[i]!=1) + dilated=1; + } + if(dilated) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + + SetupACLLayer(bottom,top); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + Dtype* top_data = top[i]->mutable_cpu_data(); + tensor_mem(this->cpu().input,(void*)bottom_data); + cpu_run(); + tensor_mem((void*)top_data,this->cpu().output); + } +} + +template <typename Dtype> +void ACLConvolutionLayer<Dtype>::Forward_gpu( + const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONV_INFO); +#endif //USE_PROFILING + ConvolutionParameter conv_param = this->layer_param_.convolution_param(); + if (this->force_bypass_acl_path_|| this->group_!=1) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 ) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + /* check dilation */ + int dilated=0; + + for(int i=0;i<this->num_spatial_axes_;i++) + { + const int *p=this->dilation_.gpu_data(); + + if(p[i]!=1) + dilated=1; + } + + if(dilated) { + ConvolutionLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + SetupACLLayer(bottom,top); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + tensor_mem(this->gpu().input,(void*)bottom_data); + gpu_run(); + tensor_mem((void*)top_data,this->gpu().output); + } +} + +template <typename Dtype> +ACLConvolutionLayer<Dtype>::~ACLConvolutionLayer() { +} + +INSTANTIATE_CLASS(ACLConvolutionLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_inner_product_layer.cpp b/src/caffe/layers/acl_inner_product_layer.cpp new file mode 100644 index 00000000..6e7a46e9 --- /dev/null +++ b/src/caffe/layers/acl_inner_product_layer.cpp @@ -0,0 +1,131 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/filler.hpp" +#include "caffe/layers/acl_inner_product_layer.hpp" +#include "caffe/util/math_functions.hpp" + + +namespace caffe { + +template <typename Dtype> +void ACLInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + InnerProductLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_FC; +} +template <typename Dtype> +void ACLInnerProductLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + + TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_); + TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_); + TensorShape biases_shape((unsigned int)this->N_); + TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_); + TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_); + checkreshape(input_shape,Caffe::arm_gpu_mode()); + if (!this->init_layer_) return; + this->init_layer_=false; + // Initialize ACL. + if (Caffe::arm_gpu_mode()) { + new_gpulayer(); + }else{ + new_cpulayer(); + } + + bool transpose = !this->layer_param_.inner_product_param().transpose(); + this->force_bypass_acl_path_ = false; + if (Caffe::arm_gpu_mode()) { + Dtype *top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + if (transpose) { + this->gpu().weights=new_tensor<GPUTensor>(weights_shape_t,(void*)(this->blobs_[0].get()->mutable_gpu_data())); + }else{ + this->gpu().weights=new_tensor<GPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data())); + } + tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data())); + if (this->bias_term_) { + this->gpu().biases=new_tensor<GPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data())); + tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data())); + } + this->gpu().input=new_tensor<GPUTensor>(input_shape,(void*)bottom_data); + this->gpu().output=new_tensor<GPUTensor>(output_shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,transpose); + }else{ + Dtype *top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + if (transpose) { + this->cpu().weights=new_tensor<CPUTensor>(weights_shape_t,(void*)(this->blobs_[0].get()->mutable_cpu_data())); + }else{ + this->cpu().weights=new_tensor<CPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data())); + } + tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data())); + if (this->bias_term_) { + this->cpu().biases=new_tensor<CPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data())); + tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data())); + } + this->cpu().input=new_tensor<CPUTensor>(input_shape,(void*)bottom_data); + this->cpu().output=new_tensor<CPUTensor>(output_shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,transpose); + } +} +template <typename Dtype> +void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + InnerProductLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + if(Caffe::arm_gpu_mode()){ + Forward_gpu(bottom, top); + return; + } +#ifdef USE_PROFILING + logtime_util log_time(ACL_FC_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + InnerProductLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + Dtype* top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + SetupACLLayer(bottom,top); + tensor_mem(this->cpu().input,(void*)(bottom_data)); + cpu_run(); + tensor_mem((void*)(top_data),this->cpu().output); +} + +template <typename Dtype> +void ACLInnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_FC_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + InnerProductLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + SetupACLLayer(bottom,top); + tensor_mem(this->gpu().input,(void*)(bottom_data)); + gpu_run(); + tensor_mem((void*)(top_data),this->gpu().output); +} + +template <typename Dtype> +ACLInnerProductLayer<Dtype>::~ACLInnerProductLayer() { +} + +INSTANTIATE_CLASS(ACLInnerProductLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_lrn_layer.cpp b/src/caffe/layers/acl_lrn_layer.cpp new file mode 100644 index 00000000..9c03cad1 --- /dev/null +++ b/src/caffe/layers/acl_lrn_layer.cpp @@ -0,0 +1,144 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/layers/acl_lrn_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + LRNLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN; +} +template <typename Dtype> +void ACLLRNLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + + TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_); + checkreshape(shape,Caffe::arm_gpu_mode()); + if (!this->init_layer_) return; + // Initialize ACL. + if (Caffe::arm_gpu_mode()) { + new_gpulayer(); + }else{ + new_cpulayer(); + } + + //this->force_bypass_acl_path_=false; + NormalizationLayerInfo *norm_info; + if(this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) + norm_info=new NormalizationLayerInfo(NormType::IN_MAP, this->size_, this->alpha_, this->beta_, this->k_); + else + norm_info=new NormalizationLayerInfo(NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_); + + if (Caffe::arm_gpu_mode()) { + Dtype *top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + this->gpu().input=new_tensor<GPUTensor>(shape,(void*)bottom_data); + this->gpu().output=new_tensor<GPUTensor>(shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->gpu().layer->configure(this->gpu().input,this->gpu().output,*norm_info); + }else{ + Dtype *top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + this->cpu().input=new_tensor<CPUTensor>(shape,(void*)bottom_data); + this->cpu().output=new_tensor<CPUTensor>(shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->cpu().layer->configure(this->cpu().input,this->cpu().output,*norm_info); + } + delete norm_info; +} +template <typename Dtype> +void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + LRNLayer<Dtype>::Reshape(bottom, top); + return; +} + +template <typename Dtype> +void ACLLRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + if(Caffe::arm_gpu_mode()){ + Forward_gpu(bottom, top); + return; + } +#ifdef USE_PROFILING + logtime_util log_time(ACL_LRN_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) { + LRNLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + SetupACLLayer(bottom,top); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + for (int n = 0; n < this->num_; ++n) { + tensor_mem(this->cpu().input,(void*)(bottom_data+ bottom[0]->offset(n))); + cpu_run(); + tensor_mem((void*)(top_data + top[0]->offset(n)),this->cpu().output); + } + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + for (int n = 0; n < bottom[0]->num(); ++n) { + tensor_mem(this->cpu().input,(void*)(bottom_data)); + cpu_run(); + tensor_mem((void*)(top_data),this->cpu().output); + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + } + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template <typename Dtype> +void ACLLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_LRN_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + LRNLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + SetupACLLayer(bottom,top); + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + for (int n = 0; n < this->num_; ++n) { + tensor_mem(this->gpu().input,(void*)(bottom_data+ bottom[0]->offset(n))); + gpu_run(); + tensor_mem((void*)(top_data + top[0]->offset(n)),this->gpu().output); + } + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + for (int n = 0; n < bottom[0]->num(); ++n) { + tensor_mem(this->gpu().input,(void*)(bottom_data)); + gpu_run(); + tensor_mem((void*)(top_data),this->gpu().output); + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + } + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template <typename Dtype> +ACLLRNLayer<Dtype>::~ACLLRNLayer() { +} + +INSTANTIATE_CLASS(ACLLRNLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_pooling_layer.cpp b/src/caffe/layers/acl_pooling_layer.cpp new file mode 100644 index 00000000..951de3e4 --- /dev/null +++ b/src/caffe/layers/acl_pooling_layer.cpp @@ -0,0 +1,150 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/layers/acl_pooling_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + PoolingLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING; +} +template <typename Dtype> +void ACLPoolingLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + + TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_); + TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_); + checkreshape(in_shape,Caffe::arm_gpu_mode()); + if (!this->init_layer_) return; + this->init_layer_=false; + // Initialize ACL. + if (Caffe::arm_gpu_mode()) { + new_gpulayer(); + }else{ + new_cpulayer(); + } + + this->force_bypass_acl_path_=false; + PoolingLayerInfo *pool_info; + if(this->layer_param_.pooling_param().pool()==PoolingParameter_PoolMethod_MAX) + pool_info=new PoolingLayerInfo(PoolingType::MAX, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL)); + else + pool_info=new PoolingLayerInfo(PoolingType::AVG, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL)); + + if (Caffe::arm_gpu_mode()) { + Dtype *top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + this->gpu().input=new_tensor<GPUTensor>(in_shape,(void*)bottom_data); + this->gpu().output=new_tensor<GPUTensor>(out_shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->gpu().layer->configure(this->gpu().input,this->gpu().output,*pool_info); + }else{ + Dtype *top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + this->cpu().input=new_tensor<CPUTensor>(in_shape,(void*)bottom_data); + this->cpu().output=new_tensor<CPUTensor>(out_shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->cpu().layer->configure(this->cpu().input,this->cpu().output,*pool_info); + } + delete pool_info; +} +template <typename Dtype> +void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + PoolingLayer<Dtype>::Reshape(bottom, top); + +} + +template <typename Dtype> +void ACLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + if(Caffe::arm_gpu_mode()){ + Forward_gpu(bottom, top); + return; + } +#ifdef USE_PROFILING + logtime_util log_time(ACL_POOLING_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX && + this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + if (this->kernel_h_!=this->kernel_w_ || top.size()>1) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + if (this->kernel_h_!=2 && this->kernel_h_!=3) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + SetupACLLayer(bottom,top); + for (int n = 0; n < bottom[0]->num(); ++n) { + for (int c = 0; c < this->channels_; ++c) { + tensor_mem(this->cpu().input,(void*)(bottom_data)); + cpu_run(); + tensor_mem((void*)(top_data),this->cpu().output); + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + } + } +} + +template <typename Dtype> +void ACLPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_POOLING_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX && + this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + if (this->kernel_h_!=this->kernel_w_) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + if (this->kernel_h_!=2 && this->kernel_h_!=3) { + PoolingLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + SetupACLLayer(bottom,top); + for (int n = 0; n < bottom[0]->num(); ++n) { + for (int c = 0; c < this->channels_; ++c) { + tensor_mem(this->gpu().input,(void*)(bottom_data)); + gpu_run(); + tensor_mem((void*)(top_data),this->gpu().output); + bottom_data += bottom[0]->offset(0, 1); + top_data += top[0]->offset(0, 1); + } + } +} + +template <typename Dtype> +ACLPoolingLayer<Dtype>::~ACLPoolingLayer() { +} + +INSTANTIATE_CLASS(ACLPoolingLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_relu_layer.cpp b/src/caffe/layers/acl_relu_layer.cpp new file mode 100644 index 00000000..03194539 --- /dev/null +++ b/src/caffe/layers/acl_relu_layer.cpp @@ -0,0 +1,70 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/layers/acl_relu_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + ReLULayer<Dtype>::LayerSetUp(bottom, top); + ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU; +} +template <typename Dtype> +void ACLReLULayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::RELU); +} +template <typename Dtype> +void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + ReLULayer<Dtype>::Reshape(bottom, top); + ACLBaseActivationLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_RELU_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + ReLULayer<Dtype>::Forward_cpu(bottom,top); + return; + } + // Fallback to standard Caffe for leaky ReLU. + if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) { + ReLULayer<Dtype>::Forward_cpu(bottom, top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top); +} + +template <typename Dtype> +void ACLReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_RELU_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + ReLULayer<Dtype>::Forward_cpu(bottom,top); + return; + } + // Fallback to standard Caffe for leaky ReLU. + if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) { + ReLULayer<Dtype>::Forward_cpu(bottom, top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top); +} + +template <typename Dtype> +ACLReLULayer<Dtype>::~ACLReLULayer() { +} + +INSTANTIATE_CLASS(ACLReLULayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_sigmoid_layer.cpp b/src/caffe/layers/acl_sigmoid_layer.cpp new file mode 100644 index 00000000..eac15651 --- /dev/null +++ b/src/caffe/layers/acl_sigmoid_layer.cpp @@ -0,0 +1,61 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/layers/acl_sigmoid_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + SigmoidLayer<Dtype>::LayerSetUp(bottom, top); + ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SIGMOID; +} + +template <typename Dtype> +void ACLSigmoidLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::LOGISTIC); +} +template <typename Dtype> +void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + SigmoidLayer<Dtype>::Reshape(bottom, top); + ACLBaseActivationLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLSigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_SIGMOID_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + SigmoidLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top); +} + +template <typename Dtype> +void ACLSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_SIGMOID_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + SigmoidLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top); +} + +template <typename Dtype> +ACLSigmoidLayer<Dtype>::~ACLSigmoidLayer() { +} + +INSTANTIATE_CLASS(ACLSigmoidLayer); + +} // namespace caffe +#endif // USE_ACL diff --git a/src/caffe/layers/acl_softmax_layer.cpp b/src/caffe/layers/acl_softmax_layer.cpp new file mode 100644 index 00000000..d32460b9 --- /dev/null +++ b/src/caffe/layers/acl_softmax_layer.cpp @@ -0,0 +1,117 @@ +#ifdef USE_ACL +#include <vector> +#include "caffe/layers/acl_softmax_layer.hpp" +#include <unistd.h> + + +namespace caffe { + +template <typename Dtype> +void ACLSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + SoftmaxLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX; +} +template <typename Dtype> +void ACLSoftmaxLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top){ + + unsigned int channels = bottom[0]->shape(this->softmax_axis_); + TensorShape shape(channels*this->inner_num_); + checkreshape(shape,Caffe::arm_gpu_mode()); + if (!this->init_layer_) return; + this->init_layer_=false; + + // Initialize ACL. + if (Caffe::arm_gpu_mode()) { + new_gpulayer(); + }else{ + new_cpulayer(); + } + + //this->force_bypass_acl_path_=false; + if (Caffe::arm_gpu_mode()) { + Dtype *top_data = top[0]->mutable_gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + this->gpu().input=new_tensor<GPUTensor>(shape,(void*)bottom_data); + this->gpu().output=new_tensor<GPUTensor>(shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->gpu().layer->configure(this->gpu().input,this->gpu().output); + }else{ + Dtype *top_data = top[0]->mutable_cpu_data(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + this->cpu().input=new_tensor<CPUTensor>(shape,(void*)bottom_data); + this->cpu().output=new_tensor<CPUTensor>(shape,(void*)top_data); +#ifdef USE_PROFILING + logtime_util log_time(ACL_CONFIG_INFO); +#endif //USE_PROFILING + this->cpu().layer->configure(this->cpu().input,this->cpu().output); + } +} +template <typename Dtype> +void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + SoftmaxLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLSoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + if(Caffe::arm_gpu_mode()){ + Forward_gpu(bottom, top); + return; + } +#ifdef USE_PROFILING + logtime_util log_time(ACL_SOFTMAX_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_ || this->inner_num_>1) { + SoftmaxLayer<Dtype>::Forward_cpu(bottom,top); + return ; + } + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + SetupACLLayer(bottom,top); + + int channels = bottom[0]->shape(this->softmax_axis_); + + for (int i = 0; i < this->outer_num_; ++i) { + tensor_mem(this->cpu().input,(void*)(bottom_data)); + cpu_run(); + tensor_mem((void*)(top_data),this->cpu().output); + top_data += channels; + bottom_data += channels; + } +} + +template <typename Dtype> +void ACLSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_SOFTMAX_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_|| this->inner_num_>1) { + SoftmaxLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + SetupACLLayer(bottom,top); + for (int i = 0; i < this->outer_num_; ++i) { + tensor_mem(this->gpu().input,(void*)(bottom_data)); + gpu_run(); + tensor_mem((void*)(top_data),this->gpu().output); + top_data += this->inner_num_; + bottom_data += this->inner_num_; + } +} + +template <typename Dtype> +ACLSoftmaxLayer<Dtype>::~ACLSoftmaxLayer() { +} + +INSTANTIATE_CLASS(ACLSoftmaxLayer); +} // namespace caffe + +#endif // USE_ACL diff --git a/src/caffe/layers/acl_tanh_layer.cpp b/src/caffe/layers/acl_tanh_layer.cpp new file mode 100644 index 00000000..a1bb632c --- /dev/null +++ b/src/caffe/layers/acl_tanh_layer.cpp @@ -0,0 +1,63 @@ +#ifdef USE_ACL +#include <vector> + +#include "caffe/layers/acl_tanh_layer.hpp" + +namespace caffe { + +template <typename Dtype> +void ACLTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + TanHLayer<Dtype>::LayerSetUp(bottom, top); + ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top); + this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_TANH; +} + +template <typename Dtype> +void ACLTanHLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){ + ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::TANH); +} + +template <typename Dtype> +void ACLTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + TanHLayer<Dtype>::Reshape(bottom, top); + ACLBaseActivationLayer<Dtype>::Reshape(bottom, top); +} + +template <typename Dtype> +void ACLTanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_TANH_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + TanHLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top); +} + +template <typename Dtype> +void ACLTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_TANH_INFO); +#endif //USE_PROFILING + if (this->force_bypass_acl_path_) { + TanHLayer<Dtype>::Forward_cpu(bottom,top); + return; + } + ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top); +} + +template <typename Dtype> +ACLTanHLayer<Dtype>::~ACLTanHLayer() { +} + +INSTANTIATE_CLASS(ACLTanHLayer); + +} // namespace caffe + +#endif // USE_ACL diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 448d86d7..5293373d 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -42,6 +42,8 @@ STUB_GPU(BNLLLayer); #endif INSTANTIATE_CLASS(BNLLLayer); +#ifndef USE_ACL REGISTER_LAYER_CLASS(BNLL); +#endif } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 00716a92..7668854c 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 /* TODO: - load file in a separate thread ("prefetch") @@ -184,3 +185,4 @@ INSTANTIATE_CLASS(HDF5DataLayer); REGISTER_LAYER_CLASS(HDF5Data); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 33eebd41..70cd9f32 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 /* TODO: - only load parts of the file, in accordance with a prototxt param "max_mem" @@ -34,3 +35,4 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index f8f1edcd..28c453a2 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include <vector> #include "hdf5.h" @@ -72,3 +73,4 @@ INSTANTIATE_CLASS(HDF5OutputLayer); REGISTER_LAYER_CLASS(HDF5Output); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu index c1685cd3..891aea03 100644 --- a/src/caffe/layers/hdf5_output_layer.cu +++ b/src/caffe/layers/hdf5_output_layer.cu @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include <vector> #include "hdf5.h" @@ -37,3 +38,4 @@ void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index e65349f0..be3791e4 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -145,6 +145,8 @@ STUB_GPU(InnerProductLayer); #endif INSTANTIATE_CLASS(InnerProductLayer); +#ifndef USE_ACL REGISTER_LAYER_CLASS(InnerProduct); +#endif } // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 353c2f95..7bf33e1d 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -5,7 +5,9 @@ #include <utility> #include <vector> +#ifdef USE_HDF5 #include "hdf5.h" +#endif // USE_HDF5 #include "caffe/common.hpp" #include "caffe/layer.hpp" @@ -769,7 +771,12 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) { template <typename Dtype> void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) { +#ifdef USE_HDF5 if (H5Fis_hdf5(trained_filename.c_str())) { +#else + if (trained_filename.size() >= 3 && + trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) { +#endif CopyTrainedLayersFromHDF5(trained_filename); } else { CopyTrainedLayersFromBinaryProto(trained_filename); @@ -786,6 +793,7 @@ void Net<Dtype>::CopyTrainedLayersFromBinaryProto( template <typename Dtype> void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) { +#ifdef USE_HDF5 hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename; @@ -832,6 +840,10 @@ void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) { } H5Gclose(data_hid); H5Fclose(file_hid); +#else + LOG(FATAL) << "CopyTrainedLayersFromHDF5 requires hdf5;" + << " compile with USE_HDF5."; +#endif // USE_HDF5 } template <typename Dtype> @@ -848,6 +860,7 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const { template <typename Dtype> void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const { +#ifdef USE_HDF5 hid_t file_hid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); CHECK_GE(file_hid, 0) @@ -901,6 +914,9 @@ void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const { H5Gclose(diff_hid); } H5Fclose(file_hid); +#else + LOG(FATAL) << "ToHDF5 requires hdf5; compile with USE_HDF5."; +#endif // USE_HDF5 } template <typename Dtype> diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index ad6abe54..725602ab 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -278,6 +278,7 @@ void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto( template <typename Dtype> void SGDSolver<Dtype>::SnapshotSolverStateToHDF5( const string& model_filename) { +#ifdef USE_HDF5 string snapshot_filename = Solver<Dtype>::SnapshotFilename(".solverstate.h5"); LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename; @@ -299,6 +300,10 @@ void SGDSolver<Dtype>::SnapshotSolverStateToHDF5( } H5Gclose(history_hid); H5Fclose(file_hid); +#else + LOG(FATAL) << "SnapshotSolverStateToHDF5 requires hdf5;" + << " compile with USE_HDF5."; +#endif // USE_HDF5 } template <typename Dtype> @@ -323,6 +328,7 @@ void SGDSolver<Dtype>::RestoreSolverStateFromBinaryProto( template <typename Dtype> void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) { +#ifdef USE_HDF5 hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file; this->iter_ = hdf5_load_int(file_hid, "iter"); @@ -344,6 +350,10 @@ void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) { } H5Gclose(history_hid); H5Fclose(file_hid); +#else + LOG(FATAL) << "RestoreSolverStateFromHDF5 requires hdf5;" + << " compile with USE_HDF5."; +#endif // USE_HDF5 } INSTANTIATE_CLASS(SGDSolver); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 88d9b785..d74ad31e 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -54,8 +54,10 @@ inline void SyncedMemory::to_cpu() { caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); head_ = SYNCED; #else +#ifndef USE_ACL NO_GPU; #endif +#endif break; case HEAD_AT_CPU: case SYNCED: @@ -113,9 +115,14 @@ const void* SyncedMemory::gpu_data() { to_gpu(); return (const void*)gpu_ptr_; #else +#ifdef USE_ACL + to_cpu(); + return (const void*)cpu_ptr_; +#else NO_GPU; return NULL; #endif +#endif } void SyncedMemory::set_gpu_data(void* data) { @@ -129,8 +136,14 @@ void SyncedMemory::set_gpu_data(void* data) { head_ = HEAD_AT_GPU; own_gpu_data_ = false; #else +#ifdef USE_ACL + gpu_ptr_ = data; + head_ = HEAD_AT_GPU; + own_gpu_data_ = false; +#else NO_GPU; #endif +#endif } void* SyncedMemory::mutable_cpu_data() { @@ -147,9 +160,15 @@ void* SyncedMemory::mutable_gpu_data() { head_ = HEAD_AT_GPU; return gpu_ptr_; #else +#ifdef USE_ACL + to_cpu(); + head_ = HEAD_AT_GPU; + return cpu_ptr_; +#else NO_GPU; return NULL; #endif +#endif } #ifndef CPU_ONLY diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp index 2bc2de1e..3b0139d5 100644 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include <string> #include <vector> @@ -121,3 +122,4 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) { } } // namespace caffe +#endif //USE_HDF5
\ No newline at end of file diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 487f5176..7d27d77c 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include <string> #include <vector> @@ -164,3 +165,4 @@ TYPED_TEST(HDF5DataLayerTest, TestSkip) { } } // namespace caffe +#endif //USE_HDF5
\ No newline at end of file diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp index ed737429..cefd853d 100644 --- a/src/caffe/util/hdf5.cpp +++ b/src/caffe/util/hdf5.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include "caffe/util/hdf5.hpp" #include <string> @@ -207,3 +208,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx) { } } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 71c02274..b0dd3020 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -85,6 +85,9 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) { template <typename Dtype> void caffe_copy(const int N, const Dtype* X, Dtype* Y) { if (X != Y) { +#ifdef USE_ACL + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) +#else if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY // NOLINT_NEXT_LINE(caffe/alt_fn) @@ -95,6 +98,7 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) { } else { memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) } +#endif } } diff --git a/unit_tests/Makefile b/unit_tests/Makefile new file mode 100644 index 00000000..034ee02f --- /dev/null +++ b/unit_tests/Makefile @@ -0,0 +1,87 @@ + +include ../Makefile.config + +CXX_SRCS+=test_pooling_layer.cpp +CXX_SRCS+=test_softmax_layer.cpp +CXX_SRCS+= test_inner_product_layer.cpp +CXX_SRCS+=test_neuron_layer.cpp +CXX_SRCS+=test_lrn_layer.cpp #failed on single channel LRN +#C_SRCS=pmu.c testbed.c +CXX_SRCS+= test_convolution_layer.cpp +#CXX_SRCS= test_fail.cpp +BIN_SRCS=test_caffe_main.cpp test.cpp + +HOME=/home/firefly +# +# caffe related stuff +# + +CAFFE_ROOT=$(HOME)/caffeOnACL +CAFFE_INCS = -I$(CAFFE_ROOT)/include -I$(CAFFE_ROOT)/distribute/include/ +CAFFE_LIBS = -L$(CAFFE_ROOT)/distribute/lib -lcaffe -lglog -lgflags -lprotobuf -lboost_system -lboost_filesystem +CAFFE_RPATH =$(CAFFE_ROOT)/distribute/lib + + +# +# google test related stuff +# + +GTEST_ROOT=/usr/local +GTEST_LIBS=$(GTEST_ROOT)/lib/libgtest_main.a $(GTEST_ROOT)/lib/libgtest.a +GTEST_INCS=-I$(GTEST_ROOT)/include/ + + +# +# arm compute +# + +ACL_ROOT=$(HOME)/ComputeLibrary +ACL_LIBS=-L$(ACL_ROOT)/build -L$(ACL_ROOT)/build/arm_compute -larm_compute -lOpenCL +ACL_RPATH=$(ACL_ROOT)/build:$(ACL_ROOT)/build/arm_compute + +COMM_CFLAGS=$(GTEST_INCS) $(CAFFE_INCS) -Wall -g -DCPU_ONLY -DUSE_ACL +#USE_PROFILING -- get profiling informations, is controled by LOGACL +#LAYER_PERF_STAT -- haitao's net profiling information +ifeq ($(USE_PROFILING), 1) + COMM_CFLAGS += -DUSE_PROFILING -DLAYER_PERF_STAT +endif + +CXXFLAGS=$(COMM_CFLAGS) -Wno-sign-compare +CFLAGS=$(COMM_CFLAGS) + +CC=gcc +CXX=g++ + +COMM_OBJS=$(CXX_SRCS:.cpp=.o) $(C_SRCS:.c=.o) +BIN_OBJS+=$(BIN_SRCS:.cpp=.o) +BIN_EXES=$(BIN_OBJS:.o=) + +LIBS+=$(CAFFE_LIBS) $(GTEST_LIBS) -lpthread -lopenblas $(ACL_LIBS) + +RT_PATH=-Wl,-rpath,$(CAFFE_RPATH):$(ACL_RPATH) + +LDFLAGS+=$(RT_PATH) + + +all : $(BIN_EXES) + +$(BIN_EXES):%:%.o + +$(BIN_EXES):$(COMM_OBJS) + + + +clean: + rm -f $(BIN_EXES) *.o *.so + +.PHONY : all clean + +%.o : %.c + $(CC) $(CFLAGS) -c $< -o $@ + +%.o : %.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +%: %.o + $(CXX) $(LDFLAGS) $< $(COMM_OBJS) -o $@ $(LIBS) + diff --git a/unit_tests/pmu.c b/unit_tests/pmu.c new file mode 100644 index 00000000..8c2f6b31 --- /dev/null +++ b/unit_tests/pmu.c @@ -0,0 +1,376 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "pmu.h" + +#define MAX_SLOT_NUM 64 /* defined by ARMv8 SPEC*/ +#define ARMV8_PMCR_N_SHIFT 11 /* Number of counters supported */ +#define ARMV8_PMCR_N_MASK 0x1f + + +static __thread struct pmu_event_record * g_rec_ptr[MAX_SLOT_NUM]; +static __thread int max_counter_slot; + +/* start and stop counter */ + +static void stop_event_profile(struct pmu_event_record * p_record) +{ + int slot=p_record->p_evt->slot; + + p_record->p_evt->enabled=0; + + stop_pmu_counter(slot); +} + + +static void init_pmu_event_record(struct pmu_event * p_evt, struct pmu_event_record * p_record) +{ + struct prof_stat *p_stat; + int i=0; + + p_record->p_evt=p_evt; + p_record->last_val=p_evt->init_val; + p_record->base_val=p_evt->init_val; + p_stat=p_record->prof_stat; + + for(i=0;i<MAX_PROF_POINTS;i++) + { + p_stat[i].prof_seq=i; + p_stat[i].max_val=0; + p_stat[i].min_val=-1U; + p_stat[i].raw_val=0xdeadbeaf; + p_stat[i].cur_val=0; + p_stat[i].total_val=0; + p_stat[i].enter_count=0; + } +} + +static void start_event_profile(struct pmu_event_record * p_record) +{ + int slot=p_record->p_evt->slot; + struct prof_stat *p_stat; + int i; + + p_record->p_evt->enabled=1; + + p_stat=p_record->prof_stat; + + for(i=0;i<MAX_PROF_POINTS;i++) + { + p_stat[i].prof_seq=i; + p_stat[i].max_val=0; + p_stat[i].min_val=-1U; + p_stat[i].raw_val=0xdeadbeaf; + p_stat[i].cur_val=0; + p_stat[i].total_val=0; + p_stat[i].enter_count=0; + } + + write_pmu_counter(slot,p_record->p_evt->init_val); + start_pmu_counter(slot); + +} + +/* create event and profile */ + + +int setup_event_counter(int slot, int event_id) +{ + + if(slot==31) + return 0; + + if(event_id>1023) + return -1; + + write_32bit_sysreg(PMSELR_EL0,slot); + write_32bit_sysreg(PMXEVTYPER_EL0,event_id); + + return 0; +} + + +static struct pmu_event_record * create_pmu_event_record(char *name, int slot, + int event_id, uint32_t init_val, char * note) +{ + struct pmu_event * p_evt; + struct pmu_event_record * p_record; + + if(setup_event_counter(slot,event_id)<0) + return NULL; + + p_evt=malloc(sizeof(struct pmu_event)); + + if(p_evt==NULL) + return NULL; + + p_evt->name=name; + p_evt->slot=slot; + p_evt->event_id=event_id; + p_evt->init_val=init_val; + p_evt->note=note; + p_evt->enabled=0; + + p_record=malloc(sizeof(struct pmu_event_record)); + + if(p_record==NULL) + { + free(p_evt); + return NULL; + } + + p_record->p_evt=p_evt; + + init_pmu_event_record(p_evt,p_record); + + return p_record; +} + + +static void record_event_prof(struct pmu_event_record * p_record, + int prof_seq, int cal_offset, int update_last) +{ + struct prof_stat * p_stat; + uint32_t evt_val; + + evt_val=read_pmu_counter(p_record->p_evt->slot); + + p_stat=&p_record->prof_stat[prof_seq]; + + p_stat->cal_offset=cal_offset; + p_stat->update_last=update_last; + p_stat->raw_val=evt_val; + + if(cal_offset) + p_stat->cur_val=evt_val-p_record->last_val; + else + p_stat->cur_val=evt_val-p_record->base_val; + + if(update_last) + p_record->last_val=evt_val; + + p_stat->total_val+=p_stat->cur_val; + + if(p_stat->cur_val>p_stat->max_val) + p_stat->max_val=p_stat->cur_val; + + if(p_stat->cur_val<p_stat->min_val) + p_stat->min_val=p_stat->cur_val; + + p_stat->enter_count++; +} + + +static void release_pmu_event_record(struct pmu_event_record * p_record) +{ + struct pmu_event * p_evt; + + p_evt=p_record->p_evt; + + if(p_evt->enabled) + stop_pmu_counter(p_evt->slot); + + free(p_evt); + free(p_record); +} + + + + +/* debugging */ + +static void dump_pmu_event(struct pmu_event * p_evt) +{ + + printf("event[%s/0x%x]: slot [%d] init_val[0x%x] enabled[%d]", + p_evt->name,p_evt->event_id,p_evt->slot,p_evt->init_val, + p_evt->enabled); + + if(p_evt->note) + printf(" note[%s]\n",p_evt->note); + else + printf("\n"); +} + +static void dump_pmu_event_record(struct pmu_event_record * p_record) +{ + int i; + struct prof_stat * p_stat; + uint64_t total_avg_val=0; + int count=0; + uint32_t avg; + + printf("------------------------------------------------------------------------\n"); + + dump_pmu_event(p_record->p_evt); + + p_stat=&p_record->prof_stat[0]; + + for(i=0;i<MAX_PROF_POINTS;i++) + { + if(p_stat[i].enter_count==0) + continue; + + avg=(uint32_t)(p_stat[i].total_val/p_stat[i].enter_count); + + printf("stat [%d]: max/min/avg [0x%x/0x%x/0x%x] total [0x%lx] count[%u]\n", + i,p_stat[i].max_val,p_stat[i].min_val, + avg, + p_stat[i].total_val,p_stat[i].enter_count); + printf(" raw_val[0x%x] cal_offset[%d] update_last[%d]\n", + p_stat[i].raw_val,p_stat[i].cal_offset,p_stat[i].update_last); + + count++; + + total_avg_val+=avg; + + + } + + printf("total [%d] points, the sum of average number is: [0x%lx]\n\n",count,total_avg_val); +} + +/* output interface */ + +void init_pmu_registers(void) +{ + /* enabled PMU in PMCR*/ + write_32bit_sysreg(PMCR_EL0,0x1); + max_counter_slot=(read_32bit_sysreg(PMCR_EL0) >> ARMV8_PMCR_N_SHIFT)&ARMV8_PMCR_N_MASK; +} + + +#define dump_32bit_sysreg(reg) \ + printf(__stringify(reg) " is [0x%08x]\n",read_32bit_sysreg(reg)) + +#define dump_64bit_sysreg(reg) \ + printf(__stringify(reg) " is [0x%016llx]\n",read_32bit_sysreg(reg)) + + +void dump_pmu_registers(void) +{ + dump_32bit_sysreg(PMCEID0_EL0); + dump_32bit_sysreg(PMCEID1_EL0); + dump_32bit_sysreg(PMOVSSET_EL0); + dump_32bit_sysreg(PMCR_EL0); + dump_32bit_sysreg(PMUSERENR_EL0); + dump_32bit_sysreg(PMCNTENSET_EL0); +} + +struct pmu_event_record * get_pmu_event_record(int slot) +{ + return g_rec_ptr[slot]; +} + +int create_pmu_event(char *name,int event_id, + uint32_t init_val, char * note) +{ + int i; + + struct pmu_event_record * p_record; + + for(i=0;i<max_counter_slot;i++) + { + if(g_rec_ptr[i]==NULL) + break; + } + + if(i==max_counter_slot) + return -1; + + p_record=create_pmu_event_record(name,i,event_id,init_val,note); + + if(p_record==NULL) + return -1; + + g_rec_ptr[i]=p_record; + + return i; +} + +void release_pmu_event(int slot) +{ + struct pmu_event_record * p_record; + + p_record=g_rec_ptr[slot]; + + if(p_record) + release_pmu_event_record(p_record); + + g_rec_ptr[slot]=NULL; +} + +void start_pmu_event(int slot) +{ + struct pmu_event_record * p_record; + + p_record=g_rec_ptr[slot]; + + start_event_profile(p_record); +} + +void stop_pmu_event(int slot) +{ + struct pmu_event_record * p_record; + + p_record=g_rec_ptr[slot]; + + stop_event_profile(p_record); +} + +void record_pmu_event(int slot, int seq, int cal_offset, int update_last) +{ + struct pmu_event_record * p_record; + + p_record=g_rec_ptr[slot]; + + record_event_prof(p_record,seq,cal_offset,update_last); +} + +void dump_pmu_event_stat(int slot) +{ + struct pmu_event_record * p_record; + + p_record=g_rec_ptr[slot]; + + dump_pmu_event_record(p_record); +} + + +uint32_t get_pmu_stat_avg(int slot) +{ + struct pmu_event_record * p_record; + struct prof_stat * p_stat; + uint32_t total_avg=0; + uint32_t avg; + int i; + + p_record=g_rec_ptr[slot]; + + for(i=0;i<MAX_PROF_POINTS;i++) + { + p_stat=&p_record->prof_stat[i]; + + if(p_stat->enter_count==0) + continue; + avg=p_stat->total_val/p_stat->enter_count; + total_avg+=avg; + } + + return total_avg; +} + +void set_pmu_event_base(int slot) +{ + struct pmu_event_record * p_record; + + uint32_t val; + + p_record=g_rec_ptr[slot]; + + val=read_pmu_counter(slot); + + p_record->last_val=val; + p_record->base_val=val; + +} diff --git a/unit_tests/pmu.h b/unit_tests/pmu.h new file mode 100644 index 00000000..9c252066 --- /dev/null +++ b/unit_tests/pmu.h @@ -0,0 +1,130 @@ +#ifndef AARCH64_PMU_H +#define AARCH64_PMU_H + +#include <stdint.h> + + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +#define read_32bit_sysreg(reg) \ + ({\ + uint32_t val;\ + __asm__ __volatile__ (\ + "mrs %0," __stringify(reg):"=r"(val));\ + val;\ + }) + + +#define read_64bit_sysreg(reg) \ + ({\ + uint64_t val;\ + __asm__ __volatile__ (\ + "mrs %0," __stringify(reg):"=r"(val));\ + val;\ + }) + +#define write_32bit_sysreg(reg,val) \ + ({\ + __asm__ __volatile__ (\ + "msr " __stringify(reg) " ,%0"::"r"(val));\ + }) + +#define write_64bit_sysreg(reg,val) write_32bit_sysreg(reg,val) + +#define MAX_PROF_POINTS 16 + +struct pmu_event +{ + int slot; + int event_id; + char * name; + uint32_t init_val; + int enabled; + char * note; +}; + +struct prof_stat +{ + int prof_seq; + uint32_t max_val; + uint32_t min_val; + uint32_t cur_val; + uint32_t raw_val; + uint64_t total_val; + uint32_t enter_count; + int cal_offset; + int update_last; +}; + + +struct pmu_event_record +{ + struct pmu_event* p_evt; + uint32_t last_val; + uint32_t base_val; + struct prof_stat prof_stat[MAX_PROF_POINTS]; +}; + +/* all functions in the group must be called on the same CPU */ + +extern void init_pmu_registers(void); +extern void dump_pmu_registers(void); + +/* create one event with event_id, return slot number in success */ +extern int create_pmu_event(char *name,int event_id, + uint32_t init_val, char * note); + +extern void release_pmu_event(int slot); + +extern void start_pmu_event(int slot); + +extern void stop_pmu_event(int slot); + +extern void set_pmu_event_base(int slot); + +extern void record_pmu_event(int slot, int seq, int cal_offset, int update_last); + +extern void dump_pmu_event_stat(int slot); + +extern struct pmu_event_record * get_pmu_event_record(int slot); + +extern uint32_t get_pmu_stat_avg(int slot); /* adding all phase avg together */ + +/* regsiter level interface */ + +extern int setup_event_counter(int slot, int event_id); + +static inline void start_pmu_counter(int slot) +{ + uint32_t mask=1<<slot; + + write_32bit_sysreg(PMCNTENSET_EL0,mask); +} + +static inline void stop_pmu_counter(int slot) +{ + uint32_t mask=1<<slot; + + write_32bit_sysreg(PMCNTENCLR_EL0,mask); +} + + +static inline void write_pmu_counter(int slot,uint32_t val) +{ + write_32bit_sysreg(PMSELR_EL0,slot); + + if(slot<31) + write_32bit_sysreg(PMXEVCNTR_EL0, val); + else + write_64bit_sysreg(PMXEVCNTR_EL0,val); + +} + +static inline uint32_t read_pmu_counter(int slot) +{ + write_32bit_sysreg(PMSELR_EL0,slot); + return read_32bit_sysreg(PMXEVCNTR_EL0); +} + +#endif diff --git a/unit_tests/prof_convolution_layer.cpp b/unit_tests/prof_convolution_layer.cpp new file mode 100644 index 00000000..01f4fcb7 --- /dev/null +++ b/unit_tests/prof_convolution_layer.cpp @@ -0,0 +1,302 @@ +#include <vector> +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/conv_layer.hpp" + +#include <glog/logging.h> + +extern "C" { +#include "testbed.h" +} + + +#define TYPED_TEST(a,b) template <typename TypeParam> void a <TypeParam>:: b (void) +#define EXPECT_NEAR(a,b,c) {} +#define EXPECT_EQ(a,b) {} + +namespace caffe { + +template <typename TypeParam> +struct CPUDevice { + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::CPU; +}; + + +template <typename TypeParam> +class ConvolutionLayerTest { + typedef typename TypeParam::Dtype Dtype; + +public: + + void TestSimpleConvolution(void); + + void TestDilatedConvolution(void); + + void Test0DConvolution(void); + + void TestSimple3DConvolution(void); + + void TestDilated3DConvolution(void); + + void Test1x1Convolution(void); + + void TestSimpleConvolutionGroup(void); + + void TestNDAgainst2D(void); + + void RunConvolution(void); + + ConvolutionLayerTest() + : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)), + blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)), + blob_top_(new Blob<Dtype>()), + blob_top_2_(new Blob<Dtype>()) {} + virtual void SetUp() { + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~ConvolutionLayerTest() { + delete blob_bottom_; + delete blob_bottom_2_; + delete blob_top_; + delete blob_top_2_; + } + + virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) { + this->ref_blob_top_.reset(new Blob<Dtype>()); + this->ref_blob_top_->ReshapeLike(*top); + return this->ref_blob_top_.get(); + } + + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_bottom_2_; + Blob<Dtype>* const blob_top_; + Blob<Dtype>* const blob_top_2_; + shared_ptr<Blob<Dtype> > ref_blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; + shared_ptr<Layer<Dtype> > layer; +}; + +TYPED_TEST(ConvolutionLayerTest, RunConvolution) { + + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); +} + + +TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + layer=shared_ptr<Layer<Dtype> > (new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); +} + + +TYPED_TEST(ConvolutionLayerTest, TestDilatedConvolution) { + typedef typename TypeParam::Dtype Dtype; + vector<int> bottom_shape; + bottom_shape.push_back(2); + bottom_shape.push_back(3); + bottom_shape.push_back(8); + bottom_shape.push_back(7); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_dilation(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + layer=shared_ptr<Layer<Dtype> > (new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); +} + +TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + const int kNumOutput = 3; + convolution_param->set_num_output(kNumOutput); + convolution_param->set_axis(3); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + layer=shared_ptr<Layer<Dtype> > ( + new ConvolutionLayer<Dtype>(layer_param)); + vector<int> top_shape = this->blob_bottom_->shape(); + top_shape[3] = kNumOutput; + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(top_shape, this->blob_top_->shape()); +} + +TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + vector<int> bottom_shape(5); + bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); + bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); + bottom_shape[2] = 5; + bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2); + bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3); + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + layer=shared_ptr<Layer<Dtype> > ( + new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); +} + +TYPED_TEST(ConvolutionLayerTest, TestDilated3DConvolution) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + vector<int> bottom_shape(5); + bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); + bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); + bottom_shape[2] = 6; + bottom_shape[3] = 7; + bottom_shape[4] = 8; + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_dilation(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + layer=shared_ptr<Layer<Dtype> > ( + new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); +} + +TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(1); + convolution_param->add_stride(1); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + layer=shared_ptr<Layer<Dtype> > ( + new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); +} + +TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(3); + convolution_param->set_group(3); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + layer=shared_ptr<Layer<Dtype> > ( + new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); +} + +} // namespace caffe + +using namespace caffe; +ConvolutionLayerTest<CPUDevice<float> > * g_convptr; + +void single_forward(void * dummy ) +{ + g_convptr->RunConvolution(); +} + +void forward_convolution(void) +{ + run_test(16,0,single_forward,NULL); +} + +#define RUN_FUNC(test_case) test_ ## test_case () + +#define DEF_TEST_FUNC(test_case) \ +void test_## test_case (void)\ +{\ + std::cout<<__FUNCTION__<<" start ..."<<std::endl;\ + g_convptr=new ConvolutionLayerTest<CPUDevice<float> >;\ + g_convptr->SetUp();\ + g_convptr->Test ## test_case ();\ + forward_convolution();\ + delete g_convptr;\ + std::cout<<__FUNCTION__<<" DONE"<<std::endl;\ +} + +DEF_TEST_FUNC(SimpleConvolution) +DEF_TEST_FUNC(DilatedConvolution) +DEF_TEST_FUNC(0DConvolution) +DEF_TEST_FUNC(Simple3DConvolution) +DEF_TEST_FUNC(Dilated3DConvolution) +DEF_TEST_FUNC(1x1Convolution) +DEF_TEST_FUNC(SimpleConvolutionGroup) + + +int main(int argc, char * argv[]) +{ + caffe::GlobalInit(&argc, &argv); + + init_testbed(); + + RUN_FUNC(SimpleConvolution); + RUN_FUNC(DilatedConvolution); + RUN_FUNC(0DConvolution); + RUN_FUNC(Simple3DConvolution); + RUN_FUNC(Dilated3DConvolution); + RUN_FUNC(1x1Convolution); + RUN_FUNC(SimpleConvolutionGroup); + + release_testbed(); + return 0; +} diff --git a/unit_tests/sgemm.cpp b/unit_tests/sgemm.cpp new file mode 100644 index 00000000..11a80a49 --- /dev/null +++ b/unit_tests/sgemm.cpp @@ -0,0 +1,74 @@ +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> + +#include <cblas.h> + +extern "C" { +#include "testbed.h" +} + +float * A; +float * B; +float * C; +int M; +int N; +int K; + + +void init_matrix(float * m, int size) +{ + int i; + for(i=0;i<size;i+=2) + m[i]=i; +} + +void init_data(void) +{ + int sizeA; + int sizeB; + int sizeC; + + sizeA=M*K; + sizeB=K*N; + sizeC=M*N; + + A=(float *)malloc(sizeA*sizeof(float)); + B=(float *)malloc(sizeB*sizeof(float)); + C=(float *)malloc(sizeC*sizeof(float)); + + init_matrix(A,sizeA); + init_matrix(B,sizeB); + init_matrix(C,sizeC); +} + + + +void run_sgemm(void * dummy) +{ + int i; + for(i=0;i<1;i++) + { + cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,M,N,K, + 1.0,A,K,B,N,0,C,N); + } + +} + +int main(int argc, char * argv[]) +{ + + M=27; + K=9; + N=37632; + + + init_data(); + init_testbed(); + + run_test(16,1,run_sgemm,NULL); + + release_testbed(); + + return 0; +} diff --git a/unit_tests/test.cpp b/unit_tests/test.cpp new file mode 100644 index 00000000..80adcc92 --- /dev/null +++ b/unit_tests/test.cpp @@ -0,0 +1,37 @@ +#include "gtest/gtest.h" + + +template <typename TypeParam> +class foo : public ::testing::Test { + +public: + foo(){}; + ~foo(){}; + + TypeParam data; +}; + + +typedef ::testing::Types<int,float > TestDtype; + +TYPED_TEST_CASE(foo,TestDtype); + +TYPED_TEST(foo,test1) +{ + + TypeParam a=10; + + this->data=10; + + EXPECT_EQ(this->data,a); + +} + + +int main(int argc, char * argv[]) +{ + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); + return 0; +} diff --git a/unit_tests/test_caffe_main.cpp b/unit_tests/test_caffe_main.cpp new file mode 100644 index 00000000..ee0dcd4a --- /dev/null +++ b/unit_tests/test_caffe_main.cpp @@ -0,0 +1,34 @@ +#include "caffe/caffe.hpp" +#include "caffe/test/test_caffe_main.hpp" + +extern "C" { +#include "testbed.h" +} + +class testbed_env: public ::testing::Environment { + + public: + testbed_env(){}; + ~testbed_env() {}; + + void SetUp(void) + { + std::cout<<"setting up testbed resource"<<std::endl; + } + + void TearDown(void) + { + std::cout<<"release testbed resource"<<std::endl; + } + +}; + + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + caffe::GlobalInit(&argc, &argv); + + ::testing::AddGlobalTestEnvironment(new testbed_env); + // invoke the test. + return RUN_ALL_TESTS(); +} diff --git a/unit_tests/test_common.cpp b/unit_tests/test_common.cpp new file mode 100644 index 00000000..58ae5c60 --- /dev/null +++ b/unit_tests/test_common.cpp @@ -0,0 +1,64 @@ +#include "gtest/gtest.h" + +#include "caffe/common.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/math_functions.hpp" + +#include "caffe/test/test_caffe_main.hpp" + +namespace caffe { + +class CommonTest : public ::testing::Test {}; + +#ifndef CPU_ONLY // GPU Caffe singleton test. + +TEST_F(CommonTest, TestCublasHandlerGPU) { + int cuda_device_id; + CUDA_CHECK(cudaGetDevice(&cuda_device_id)); + EXPECT_TRUE(Caffe::cublas_handle()); +} + +#endif + +TEST_F(CommonTest, TestBrewMode) { + Caffe::set_mode(Caffe::CPU); + EXPECT_EQ(Caffe::mode(), Caffe::CPU); + Caffe::set_mode(Caffe::GPU); + EXPECT_EQ(Caffe::mode(), Caffe::GPU); +} + +TEST_F(CommonTest, TestRandSeedCPU) { + SyncedMemory data_a(10 * sizeof(int)); + SyncedMemory data_b(10 * sizeof(int)); + Caffe::set_random_seed(1701); + caffe_rng_bernoulli(10, 0.5, static_cast<int*>(data_a.mutable_cpu_data())); + + Caffe::set_random_seed(1701); + caffe_rng_bernoulli(10, 0.5, static_cast<int*>(data_b.mutable_cpu_data())); + + for (int i = 0; i < 10; ++i) { + EXPECT_EQ(static_cast<const int*>(data_a.cpu_data())[i], + static_cast<const int*>(data_b.cpu_data())[i]); + } +} + +#ifndef CPU_ONLY // GPU Caffe singleton test. + +TEST_F(CommonTest, TestRandSeedGPU) { + SyncedMemory data_a(10 * sizeof(unsigned int)); + SyncedMemory data_b(10 * sizeof(unsigned int)); + Caffe::set_random_seed(1701); + CURAND_CHECK(curandGenerate(Caffe::curand_generator(), + static_cast<unsigned int*>(data_a.mutable_gpu_data()), 10)); + Caffe::set_random_seed(1701); + CURAND_CHECK(curandGenerate(Caffe::curand_generator(), + static_cast<unsigned int*>(data_b.mutable_gpu_data()), 10)); + for (int i = 0; i < 10; ++i) { + EXPECT_EQ(((const unsigned int*)(data_a.cpu_data()))[i], + ((const unsigned int*)(data_b.cpu_data()))[i]); + } +} + +#endif + +} // namespace caffe diff --git a/unit_tests/test_convolution_layer.cpp b/unit_tests/test_convolution_layer.cpp new file mode 100644 index 00000000..b2db63ee --- /dev/null +++ b/unit_tests/test_convolution_layer.cpp @@ -0,0 +1,888 @@ +#include <vector> + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/conv_layer.hpp" + +#ifdef USE_CUDNN +#include "caffe/layers/cudnn_conv_layer.hpp" +#endif + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template <typename Dtype> +static void dump_blob(const Blob<Dtype> * blob, const char * outfile) +{ + std::ofstream os; + os.open(outfile); + + for(int i=0;i<blob->LegacyShape(0);i++) + { + os<<"batch: "<<i<<std::endl; + + for(int j=0;j<blob->LegacyShape(1);j++) + { + os<<"channel: "<<j<<std::endl; + + for(int k=0;k<blob->LegacyShape(2);k++) + { + for(int l=0;l<blob->LegacyShape(3);l++) + { + Dtype data=blob->data_at(i,j,k,l); + os<<" "<<data; + } + os<<std::endl; + } + os<<std::endl; + } + + } + + os.close(); + +} + + +template <typename Dtype> +static void fill_blob_data(Blob<Dtype >* bottom, int fixed, float val) +{ + for(int i=0;i<bottom->num();i++) + for(int j=0;j<bottom->channels();j++) + for(int l=0;l<bottom->height();l++) + for(int k=0;k<bottom->width();k++) + { + int offset; + Dtype * ptr; + + offset=i*bottom->channels()*bottom->height()*bottom->width()+ + j*bottom->height()*bottom->width()+ + l*bottom->width()+k; + + ptr=bottom->mutable_cpu_data(); + + if(fixed) + ptr[offset]=val; + else + ptr[offset]=offset; + + } + + +} + + +// Reference convolution for checking results: +// accumulate through explicit loops over input, output, and filters. +template <typename Dtype> +void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param, + const vector<shared_ptr<Blob<Dtype> > >& weights, + Blob<Dtype>* out) { + const bool has_depth = (out->num_axes() == 5); + if (!has_depth) { CHECK_EQ(4, out->num_axes()); } + // Kernel size, stride, and pad + int kernel_h, kernel_w; + if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) { + kernel_h = conv_param->kernel_h(); + kernel_w = conv_param->kernel_w(); + } else { + kernel_h = kernel_w = conv_param->kernel_size(0); + } + int pad_h, pad_w; + if (conv_param->has_pad_h() || conv_param->has_pad_w()) { + pad_h = conv_param->pad_h(); + pad_w = conv_param->pad_w(); + } else { + pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0; + } + int stride_h, stride_w; + if (conv_param->has_stride_h() || conv_param->has_stride_w()) { + stride_h = conv_param->stride_h(); + stride_w = conv_param->stride_w(); + } else { + stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1; + } + int dilation_h, dilation_w; + dilation_h = dilation_w = conv_param->dilation_size() ? + conv_param->dilation(0) : 1; + int kernel_d, pad_d, stride_d, dilation_d; + if (has_depth) { + kernel_d = kernel_h; + stride_d = stride_h; + pad_d = pad_h; + dilation_d = dilation_h; + } else { + kernel_d = stride_d = dilation_d = 1; + pad_d = 0; + } + // Groups + int groups = conv_param->group(); + int o_g = out->shape(1) / groups; + int k_g = in->shape(1) / groups; + int o_head, k_head; + // Convolution + vector<int> weight_offset(4 + has_depth); + vector<int> in_offset(4 + has_depth); + vector<int> out_offset(4 + has_depth); + Dtype* out_data = out->mutable_cpu_data(); + for (int n = 0; n < out->shape(0); n++) { + for (int g = 0; g < groups; g++) { + o_head = o_g * g; + k_head = k_g * g; + for (int o = 0; o < o_g; o++) { + for (int k = 0; k < k_g; k++) { + for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) { + for (int y = 0; y < out->shape(2 + has_depth); y++) { + for (int x = 0; x < out->shape(3 + has_depth); x++) { + for (int r = 0; r < kernel_d; r++) { + for (int p = 0; p < kernel_h; p++) { + for (int q = 0; q < kernel_w; q++) { + int in_z = z * stride_d - pad_d + r * dilation_d; + int in_y = y * stride_h - pad_h + p * dilation_h; + int in_x = x * stride_w - pad_w + q * dilation_w; + if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1) + && in_y >= 0 && in_y < in->shape(2 + has_depth) + && in_x >= 0 && in_x < in->shape(3 + has_depth)) { + weight_offset[0] = o + o_head; + weight_offset[1] = k; + if (has_depth) { weight_offset[2] = r; } + weight_offset[2 + has_depth] = p; + weight_offset[3 + has_depth] = q; + in_offset[0] = n; + in_offset[1] = k + k_head; + if (has_depth) { in_offset[2] = in_z; } + in_offset[2 + has_depth] = in_y; + in_offset[3 + has_depth] = in_x; + out_offset[0] = n; + out_offset[1] = o + o_head; + if (has_depth) { out_offset[2] = z; } + out_offset[2 + has_depth] = y; + out_offset[3 + has_depth] = x; + out_data[out->offset(out_offset)] += + in->data_at(in_offset) + * weights[0]->data_at(weight_offset); + } + } + } + } + } + } + } + } + } + } + } + // Bias + if (conv_param->bias_term()) { + const Dtype* bias_data = weights[1]->cpu_data(); + for (int n = 0; n < out->shape(0); n++) { + for (int o = 0; o < out->shape(1); o++) { + for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) { + for (int y = 0; y < out->shape(2 + has_depth); y++) { + for (int x = 0; x < out->shape(3 + has_depth); x++) { + out_offset[0] = n; + out_offset[1] = o; + if (has_depth) { out_offset[2] = z; } + out_offset[2 + has_depth] = y; + out_offset[3 + has_depth] = x; + out_data[out->offset(out_offset)] += bias_data[o]; + } + } + } + } + } + } +} + +template void caffe_conv(const Blob<float>* in, + ConvolutionParameter* conv_param, + const vector<shared_ptr<Blob<float> > >& weights, + Blob<float>* out); +template void caffe_conv(const Blob<double>* in, + ConvolutionParameter* conv_param, + const vector<shared_ptr<Blob<double> > >& weights, + Blob<double>* out); + +template <typename TypeParam> +class ConvolutionLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + + protected: + ConvolutionLayerTest() + : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)), + blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)), + blob_top_(new Blob<Dtype>()), + blob_top_2_(new Blob<Dtype>()) {} + virtual void SetUp() { + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~ConvolutionLayerTest() { + delete blob_bottom_; + delete blob_bottom_2_; + delete blob_top_; + delete blob_top_2_; + } + + virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) { + this->ref_blob_top_.reset(new Blob<Dtype>()); + this->ref_blob_top_->ReshapeLike(*top); + return this->ref_blob_top_.get(); + } + + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_bottom_2_; + Blob<Dtype>* const blob_top_; + Blob<Dtype>* const blob_top_2_; + shared_ptr<Blob<Dtype> > ref_blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; +}; + + +typedef ::testing::Types<CPUDevice<float> > float_only; + +#define TestDtypesAndDevices float_only +TYPED_TEST_CASE(ConvolutionLayerTest, TestDtypesAndDevices); + +TYPED_TEST(ConvolutionLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 4); + EXPECT_EQ(this->blob_top_->height(), 2); + EXPECT_EQ(this->blob_top_->width(), 1); + EXPECT_EQ(this->blob_top_2_->num(), 2); + EXPECT_EQ(this->blob_top_2_->channels(), 4); + EXPECT_EQ(this->blob_top_2_->height(), 2); + EXPECT_EQ(this->blob_top_2_->width(), 1); + // setting group should not change the shape + convolution_param->set_num_output(3); + convolution_param->set_group(3); + layer.reset(new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 2); + EXPECT_EQ(this->blob_top_->width(), 1); + EXPECT_EQ(this->blob_top_2_->num(), 2); + EXPECT_EQ(this->blob_top_2_->channels(), 3); + EXPECT_EQ(this->blob_top_2_->height(), 2); + EXPECT_EQ(this->blob_top_2_->width(), 1); +} + + +TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) { + typedef typename TypeParam::Dtype Dtype; + + + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(3); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + + vector<int> bottom_shape; + bottom_shape.push_back(2); + bottom_shape.push_back(3); + bottom_shape.push_back(5); + bottom_shape.push_back(5); + + this->blob_bottom_->Reshape(bottom_shape); + this->blob_bottom_2_->Reshape(bottom_shape); + + fill_blob_data(this->blob_bottom_,0,1); + fill_blob_data(this->blob_bottom_2_,1,1); + + layer_param.set_type("Convolution"); + + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + //fill_blob_data(layer->blobs()[0].get(),1,1); + + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + +#ifdef LAYER_PERF_STAT + perf_stat * p_time_stat; + p_time_stat=layer->get_time_stat(); + std::cout<<"start: "<<p_time_stat->start; + std::cout<<" end: "<<p_time_stat->end; + std::cout<<" used: "<<p_time_stat->used; + std::cout<<" total: "<<p_time_stat->total; + std::cout<<" count: "<<p_time_stat->count<<std::endl; +#endif + + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } + + +// dump_blob(this->blob_bottom_,"bottom.data"); +// dump_blob(this->blob_top_,"top.data"); +// dump_blob(this->ref_blob_top_.get(),"reftop.data"); +// dump_blob(layer->blobs()[0].get(),"weight.data"); +// dump_blob(layer->blobs()[1].get(),"bias.data"); +#if 1 + caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_2_)); + top_data = this->blob_top_2_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } +#endif +} + +#if 0 + +TYPED_TEST(ConvolutionLayerTest, TestDilatedConvolution) { + typedef typename TypeParam::Dtype Dtype; + vector<int> bottom_shape; + bottom_shape.push_back(2); + bottom_shape.push_back(3); + bottom_shape.push_back(8); + bottom_shape.push_back(7); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_dilation(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } + caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_2_)); + top_data = this->blob_top_2_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } +} + +TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + const int kNumOutput = 3; + convolution_param->set_num_output(kNumOutput); + convolution_param->set_axis(3); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + vector<int> top_shape = this->blob_bottom_->shape(); + top_shape[3] = kNumOutput; + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(top_shape, this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + vector<int> weight_offset(2); + const Blob<Dtype>* weight = layer->blobs()[0].get(); + const Blob<Dtype>* bias = layer->blobs()[1].get(); + const int num = this->blob_top_->count(3); + const int dim = this->blob_top_->shape(3); + const int bottom_dim = this->blob_bottom_->shape(3); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < dim; ++d) { + weight_offset[0] = d; + Dtype value = bias->cpu_data()[d]; + for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) { + weight_offset[1] = bottom_d; + value += weight->data_at(weight_offset) * + this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d]; + } + EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4); + } + } +} + +TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + vector<int> bottom_shape(5); + bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); + bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); + bottom_shape[2] = 5; + bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2); + bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3); + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } + caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_2_)); + top_data = this->blob_top_2_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } +} + +TYPED_TEST(ConvolutionLayerTest, TestDilated3DConvolution) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + vector<int> bottom_shape(5); + bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); + bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); + bottom_shape[2] = 6; + bottom_shape[3] = 7; + bottom_shape[4] = 8; + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_dilation(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } + caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_2_)); + top_data = this->blob_top_2_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } +} + +#endif + +TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); +#if 0 + convolution_param->add_kernel_size(1); + convolution_param->set_num_output(2); + + vector<int> bottom_shape; + bottom_shape.push_back(1); + bottom_shape.push_back(32); + bottom_shape.push_back(133); + bottom_shape.push_back(98); + + this->blob_bottom_vec_[0]->Reshape(bottom_shape); + +#else + convolution_param->add_kernel_size(1); + convolution_param->add_stride(1); + convolution_param->set_num_output(4); +#endif + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(1); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + fill_blob_data(this->blob_bottom_,1,1); + fill_blob_data(layer->blobs()[0].get(),1,1); + + + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + fill_blob_data(this->blob_bottom_,1,3); + + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + // std::cout<<i<<": "<< top_data[i]<<" "<<ref_top_data[i]<<std::endl; + } +} + + +#if 0 +TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(3); + convolution_param->set_group(3); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } +} + +TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) { + // Test separable convolution by computing the Sobel operator + // as a single filter then comparing the result + // as the convolution of two rectangular filters. + typedef typename TypeParam::Dtype Dtype; + // Fill bottoms with identical Gaussian noise. + shared_ptr<GaussianFiller<Dtype> > filler; + FillerParameter filler_param; + filler_param.set_value(1.); + filler.reset(new GaussianFiller<Dtype>(filler_param)); + filler->Fill(this->blob_bottom_); + this->blob_bottom_2_->CopyFrom(*this->blob_bottom_); + // Compute Sobel G_x operator as 3 x 3 convolution. + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(1); + convolution_param->set_bias_term(false); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + + layer->blobs().resize(1); + layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 3)); + Dtype* weights = layer->blobs()[0]->mutable_cpu_data(); + for (int c = 0; c < 3; ++c) { + int i = c * 9; // 3 x 3 filter + weights[i + 0] = -1; + weights[i + 1] = 0; + weights[i + 2] = 1; + weights[i + 3] = -2; + weights[i + 4] = 0; + weights[i + 5] = 2; + weights[i + 6] = -1; + weights[i + 7] = 0; + weights[i + 8] = 1; + } + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions. + // (1) the [1 2 1] column filter + vector<Blob<Dtype>*> sep_blob_bottom_vec; + vector<Blob<Dtype>*> sep_blob_top_vec; + shared_ptr<Blob<Dtype> > blob_sep(new Blob<Dtype>()); + sep_blob_bottom_vec.push_back(this->blob_bottom_2_); + sep_blob_top_vec.push_back(this->blob_top_2_); + convolution_param->clear_kernel_size(); + convolution_param->clear_stride(); + convolution_param->set_kernel_h(3); + convolution_param->set_kernel_w(1); + convolution_param->set_stride_h(2); + convolution_param->set_stride_w(1); + convolution_param->set_num_output(1); + convolution_param->set_bias_term(false); + layer.reset(new ConvolutionLayer<Dtype>(layer_param)); + layer->blobs().resize(1); + layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 1)); + Dtype* weights_1 = layer->blobs()[0]->mutable_cpu_data(); + for (int c = 0; c < 3; ++c) { + int i = c * 3; // 3 x 1 filter + weights_1[i + 0] = 1; + weights_1[i + 1] = 2; + weights_1[i + 2] = 1; + } + layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); + layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); + // (2) the [-1 0 1] row filter + blob_sep->CopyFrom(*this->blob_top_2_, false, true); + sep_blob_bottom_vec.clear(); + sep_blob_bottom_vec.push_back(blob_sep.get()); + convolution_param->set_kernel_h(1); + convolution_param->set_kernel_w(3); + convolution_param->set_stride_h(1); + convolution_param->set_stride_w(2); + convolution_param->set_num_output(1); + convolution_param->set_bias_term(false); + layer.reset(new ConvolutionLayer<Dtype>(layer_param)); + layer->blobs().resize(1); + layer->blobs()[0].reset(new Blob<Dtype>(1, 1, 1, 3)); + Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data(); + weights_2[0] = -1; + weights_2[1] = 0; + weights_2[2] = 1; + layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec); + layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec); + // Test equivalence of full and separable filters. + const Dtype* top_data = this->blob_top_->cpu_data(); + const Dtype* sep_top_data = this->blob_top_2_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], sep_top_data[i], 1e-4); + } +} + +TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) { + typedef typename TypeParam::Dtype Dtype; + const int kernel_h = 11; + const int kernel_w = 13; + vector<int> bottom_shape(4); + bottom_shape[0] = 15; + bottom_shape[1] = 18; + bottom_shape[2] = kernel_h * 2; + bottom_shape[3] = kernel_w * 2; + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->set_num_output(12); + convolution_param->set_bias_term(false); + convolution_param->set_group(6); + convolution_param->set_kernel_h(kernel_h); + convolution_param->set_kernel_w(kernel_w); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + Blob<Dtype> weights; + Blob<Dtype> top_diff; + // Shape and fill weights and top_diff. + bool copy_diff; + bool reshape; + { + ConvolutionLayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + top_diff.ReshapeLike(*this->blob_top_); + filler.Fill(&top_diff); + ASSERT_EQ(1, layer.blobs().size()); + copy_diff = false; reshape = true; + weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape); + } + vector<bool> propagate_down(1, true); + Blob<Dtype> result_2d; + Blob<Dtype> backward_result_2d; + Blob<Dtype> backward_weight_result_2d; + // Test with 2D im2col + { + caffe_set(this->blob_top_->count(), Dtype(0), + this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_bottom_->count(), Dtype(0), + this->blob_bottom_->mutable_cpu_diff()); + caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); + // Do SetUp and Forward; save Forward result in result_2d. + convolution_param->set_force_nd_im2col(false); + + layer_param.set_type("Convolution"); + + Layer<Dtype> & layer_2d=*LayerRegistry<Dtype>::CreateLayer(layer_param); + + + layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(1, layer_2d.blobs().size()); + copy_diff = false; reshape = false; + layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape); + layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + copy_diff = false; reshape = true; + result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape); + // Copy pre-generated top diff into actual top diff; + // do Backward and save result in backward_result_2d. + ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); + caffe_copy(top_diff.count(), top_diff.cpu_data(), + this->blob_top_->mutable_cpu_diff()); + layer_2d.Backward(this->blob_top_vec_, propagate_down, + this->blob_bottom_vec_); + copy_diff = true; reshape = true; + backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape); + backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape); + } + Blob<Dtype> result_nd; + Blob<Dtype> backward_result_nd; + Blob<Dtype> backward_weight_result_nd; + // Test with ND im2col + { + caffe_set(this->blob_top_->count(), Dtype(0), + this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_bottom_->count(), Dtype(0), + this->blob_bottom_->mutable_cpu_diff()); + caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); + // Do SetUp and Forward; save Forward result in result_nd. + convolution_param->set_force_nd_im2col(true); + + layer_param.set_type("Convolution"); + + Layer<Dtype>& layer_nd=*LayerRegistry<Dtype>::CreateLayer(layer_param); + + layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(1, layer_nd.blobs().size()); + copy_diff = false; reshape = false; + layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape); + layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + copy_diff = false; reshape = true; + result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape); + // Copy pre-generated top diff into actual top diff; + // do Backward and save result in backward_result_nd. + ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); + caffe_copy(top_diff.count(), top_diff.cpu_data(), + this->blob_top_->mutable_cpu_diff()); + layer_nd.Backward(this->blob_top_vec_, propagate_down, + this->blob_bottom_vec_); + copy_diff = true; reshape = true; + backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape); + backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape); + } + ASSERT_EQ(result_nd.count(), result_2d.count()); + for (int i = 0; i < result_2d.count(); ++i) { + EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]); + } + ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); + for (int i = 0; i < backward_result_2d.count(); ++i) { + EXPECT_EQ(backward_result_2d.cpu_diff()[i], + backward_result_nd.cpu_diff()[i]); + } + ASSERT_EQ(backward_weight_result_nd.count(), + backward_weight_result_2d.count()); + for (int i = 0; i < backward_weight_result_2d.count(); ++i) { + EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i], + backward_weight_result_nd.cpu_diff()[i]); + } +} +#endif + + +} // namespace caffe diff --git a/unit_tests/test_fail.cpp b/unit_tests/test_fail.cpp new file mode 100644 index 00000000..0bc54954 --- /dev/null +++ b/unit_tests/test_fail.cpp @@ -0,0 +1,419 @@ +#include <vector> + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/conv_layer.hpp" + +#ifdef USE_CUDNN +#include "caffe/layers/cudnn_conv_layer.hpp" +#endif + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +#include <iomanip> + +namespace caffe { + +template <typename Dtype> +void dump_blob(const Blob<Dtype> * blob, const char * outfile) +{ + std::ofstream os; + os.open(outfile); + + os<<setiosflags(ios::fixed); + + for(int i=0;i<blob->LegacyShape(0);i++) + { + + for(int j=0;j<blob->LegacyShape(1);j++) + { + + for(int k=0;k<blob->LegacyShape(2);k++) + { + for(int l=0;l<blob->LegacyShape(3);l++) + { + Dtype data=blob->data_at(i,j,k,l); + os<<std::setprecision(12)<<data<<", "; + } + os<<std::endl; + } + os<<std::endl; + } + os<<std::endl; + } + + os.close(); + +} + + +template <typename Dtype> +void fill_blob_data(Blob<Dtype >* bottom, int fixed, float val) +{ + for(int i=0;i<bottom->num();i++) + for(int j=0;j<bottom->channels();j++) + for(int l=0;l<bottom->height();l++) + for(int k=0;k<bottom->width();k++) + { + int offset; + Dtype * ptr; + + offset=i*bottom->channels()*bottom->height()*bottom->width()+ + j*bottom->height()*bottom->width()+ + l*bottom->width()+k; + + ptr=bottom->mutable_cpu_data(); + + if(fixed) + ptr[offset]=val; + else + ptr[offset]=offset+100; + + } + + +} + + +template <typename Dtype> +void load_blob_data(Blob<Dtype >* bottom, Dtype * p_data) +{ + for(int i=0;i<bottom->num();i++) + for(int j=0;j<bottom->channels();j++) + for(int l=0;l<bottom->height();l++) + for(int k=0;k<bottom->width();k++) + { + int offset; + Dtype * ptr; + + offset=i*bottom->channels()*bottom->height()*bottom->width()+ + j*bottom->height()*bottom->width()+ + l*bottom->width()+k; + + ptr=bottom->mutable_cpu_data(); + + ptr[offset]=p_data[offset]; + + } + +} + + + +// Reference convolution for checking results: +// accumulate through explicit loops over input, output, and filters. +template <typename Dtype> +void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param, + const vector<shared_ptr<Blob<Dtype> > >& weights, + Blob<Dtype>* out) { + const bool has_depth = (out->num_axes() == 5); + if (!has_depth) { CHECK_EQ(4, out->num_axes()); } + // Kernel size, stride, and pad + int kernel_h, kernel_w; + if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) { + kernel_h = conv_param->kernel_h(); + kernel_w = conv_param->kernel_w(); + } else { + kernel_h = kernel_w = conv_param->kernel_size(0); + } + int pad_h, pad_w; + if (conv_param->has_pad_h() || conv_param->has_pad_w()) { + pad_h = conv_param->pad_h(); + pad_w = conv_param->pad_w(); + } else { + pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0; + } + int stride_h, stride_w; + if (conv_param->has_stride_h() || conv_param->has_stride_w()) { + stride_h = conv_param->stride_h(); + stride_w = conv_param->stride_w(); + } else { + stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1; + } + int dilation_h, dilation_w; + dilation_h = dilation_w = conv_param->dilation_size() ? + conv_param->dilation(0) : 1; + int kernel_d, pad_d, stride_d, dilation_d; + if (has_depth) { + kernel_d = kernel_h; + stride_d = stride_h; + pad_d = pad_h; + dilation_d = dilation_h; + } else { + kernel_d = stride_d = dilation_d = 1; + pad_d = 0; + } + // Groups + int groups = conv_param->group(); + int o_g = out->shape(1) / groups; + int k_g = in->shape(1) / groups; + int o_head, k_head; + // Convolution + vector<int> weight_offset(4 + has_depth); + vector<int> in_offset(4 + has_depth); + vector<int> out_offset(4 + has_depth); + Dtype* out_data = out->mutable_cpu_data(); + for (int n = 0; n < out->shape(0); n++) { + for (int g = 0; g < groups; g++) { + o_head = o_g * g; + k_head = k_g * g; + for (int o = 0; o < o_g; o++) { + for (int k = 0; k < k_g; k++) { + for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) { + for (int y = 0; y < out->shape(2 + has_depth); y++) { + for (int x = 0; x < out->shape(3 + has_depth); x++) { + for (int r = 0; r < kernel_d; r++) { + for (int p = 0; p < kernel_h; p++) { + for (int q = 0; q < kernel_w; q++) { + int in_z = z * stride_d - pad_d + r * dilation_d; + int in_y = y * stride_h - pad_h + p * dilation_h; + int in_x = x * stride_w - pad_w + q * dilation_w; + if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1) + && in_y >= 0 && in_y < in->shape(2 + has_depth) + && in_x >= 0 && in_x < in->shape(3 + has_depth)) { + weight_offset[0] = o + o_head; + weight_offset[1] = k; + if (has_depth) { weight_offset[2] = r; } + weight_offset[2 + has_depth] = p; + weight_offset[3 + has_depth] = q; + in_offset[0] = n; + in_offset[1] = k + k_head; + if (has_depth) { in_offset[2] = in_z; } + in_offset[2 + has_depth] = in_y; + in_offset[3 + has_depth] = in_x; + out_offset[0] = n; + out_offset[1] = o + o_head; + if (has_depth) { out_offset[2] = z; } + out_offset[2 + has_depth] = y; + out_offset[3 + has_depth] = x; + out_data[out->offset(out_offset)] += + in->data_at(in_offset) + * weights[0]->data_at(weight_offset); + } + } + } + } + } + } + } + } + } + } + } + // Bias + if (conv_param->bias_term()) { + const Dtype* bias_data = weights[1]->cpu_data(); + for (int n = 0; n < out->shape(0); n++) { + for (int o = 0; o < out->shape(1); o++) { + for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) { + for (int y = 0; y < out->shape(2 + has_depth); y++) { + for (int x = 0; x < out->shape(3 + has_depth); x++) { + out_offset[0] = n; + out_offset[1] = o; + if (has_depth) { out_offset[2] = z; } + out_offset[2 + has_depth] = y; + out_offset[3 + has_depth] = x; + out_data[out->offset(out_offset)] += bias_data[o]; + } + } + } + } + } + } +} + +template void caffe_conv(const Blob<float>* in, + ConvolutionParameter* conv_param, + const vector<shared_ptr<Blob<float> > >& weights, + Blob<float>* out); +template void caffe_conv(const Blob<double>* in, + ConvolutionParameter* conv_param, + const vector<shared_ptr<Blob<double> > >& weights, + Blob<double>* out); + +template <typename TypeParam> +class ConvolutionLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + + protected: + ConvolutionLayerTest() + : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)), + blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)), + blob_top_(new Blob<Dtype>()), + blob_top_2_(new Blob<Dtype>()) {} + virtual void SetUp() { + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~ConvolutionLayerTest() { + delete blob_bottom_; + delete blob_bottom_2_; + delete blob_top_; + delete blob_top_2_; + } + + virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) { + this->ref_blob_top_.reset(new Blob<Dtype>()); + this->ref_blob_top_->ReshapeLike(*top); + return this->ref_blob_top_.get(); + } + + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_bottom_2_; + Blob<Dtype>* const blob_top_; + Blob<Dtype>* const blob_top_2_; + shared_ptr<Blob<Dtype> > ref_blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; +}; + + +typedef ::testing::Types<CPUDevice<float> > float_only; + +#define TestDtypesAndDevices float_only +TYPED_TEST_CASE(ConvolutionLayerTest, TestDtypesAndDevices); + +TYPED_TEST(ConvolutionLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + + layer_param.set_type("Convolution"); + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 4); + EXPECT_EQ(this->blob_top_->height(), 2); + EXPECT_EQ(this->blob_top_->width(), 1); + EXPECT_EQ(this->blob_top_2_->num(), 2); + EXPECT_EQ(this->blob_top_2_->channels(), 4); + EXPECT_EQ(this->blob_top_2_->height(), 2); + EXPECT_EQ(this->blob_top_2_->width(), 1); + // setting group should not change the shape + convolution_param->set_num_output(3); + convolution_param->set_group(3); + layer.reset(new ConvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 2); + EXPECT_EQ(this->blob_top_->width(), 1); + EXPECT_EQ(this->blob_top_2_->num(), 2); + EXPECT_EQ(this->blob_top_2_->channels(), 3); + EXPECT_EQ(this->blob_top_2_->height(), 2); + EXPECT_EQ(this->blob_top_2_->width(), 1); +} + +float fail3_weight[]={ +-0.850632905960, -1.578843951225, -0.890021681786, +0.971448659897, -0.538104891777, 0.233876436949, +-1.242745161057, 2.211859703064, 0.525026142597, + +-1.726792931557, -1.194667577744, 1.119420289993, +-1.539444208145, 1.725312829018, -1.573384165764, +0.519557833672, 0.376551657915, -0.615215837955, + +0.758795797825, -0.498177528381, 0.254181325436, +-0.071698464453, -1.192728281021, 0.776199519634, +1.837580919266, -0.478745609522, -0.804457962513, + + +-2.220808744431, -0.892578184605, -1.422935843468, +-1.707052111626, -1.837757468224, -1.312300324440, +-1.251585721970, -1.591378808022, -0.577652215958, + +1.727164268494, 0.176050186157, -1.804216146469, +0.547152698040, -0.024264926091, -2.040683984756, +-2.159983396530, 1.692966818810, -1.558626413345, + +-1.242013096809, 0.122898645699, -0.146973758936, +-0.405744194984, -1.716119289398, 1.215066313744, +1.061164021492, -0.705341339111, -0.245370775461, + + +0.781007647514, -0.104610890150, 2.421228170395, +0.348720043898, 0.289468020201, 1.841132760048, +-0.835199236870, -0.242239400744, 1.169079542160, + +0.165550187230, -0.418082803488, 0.479667782784, +-0.241552516818, 0.767971694469, -0.760977804661, +-2.419095993042, 0.774254024029, 0.541432976723, + +0.855292022228, -0.144438281655, 0.251998007298, +-0.242634430528, -0.044748753309, -0.321820944548, +-0.487676948309, -0.761075556278, -0.646164357662 +}; + +TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) { + typedef typename TypeParam::Dtype Dtype; + + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(1); + convolution_param->set_num_output(3); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + + vector<int> bottom_shape; + bottom_shape.push_back(1); + bottom_shape.push_back(3); + bottom_shape.push_back(5); + bottom_shape.push_back(5); + + this->blob_bottom_->Reshape(bottom_shape); + + fill_blob_data(this->blob_bottom_,0,1); + + layer_param.set_type("Convolution"); + + shared_ptr<Layer<Dtype> > layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + //fill_blob_data(layer->blobs()[0].get(),1,1); + load_blob_data(layer->blobs()[0].get(),fail3_weight); + + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Check against reference convolution. + const Dtype* top_data; + const Dtype* ref_top_data; + caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(), + this->MakeReferenceTop(this->blob_top_)); + top_data = this->blob_top_->cpu_data(); + ref_top_data = this->ref_blob_top_->cpu_data(); + for (int i = 0; i < this->blob_top_->count(); ++i) { + EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4); + } + + + dump_blob(this->blob_bottom_,"bottom.data"); + dump_blob(this->blob_top_,"top.data"); + dump_blob(this->ref_blob_top_.get(),"reftop.data"); + dump_blob(layer->blobs()[0].get(),"weight.data"); + dump_blob(layer->blobs()[1].get(),"bias.data"); +} + +} diff --git a/unit_tests/test_inner_product_layer.cpp b/unit_tests/test_inner_product_layer.cpp new file mode 100644 index 00000000..efe3d87f --- /dev/null +++ b/unit_tests/test_inner_product_layer.cpp @@ -0,0 +1,295 @@ +#include <vector> + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/inner_product_layer.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +#ifndef CPU_ONLY +extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +#endif + +template <typename TypeParam> +class InnerProductLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + protected: + InnerProductLayerTest() + : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)), + blob_bottom_nobatch_(new Blob<Dtype>(1, 2, 3, 4)), + blob_top_(new Blob<Dtype>()) { + // fill the values + FillerParameter filler_param; + UniformFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~InnerProductLayerTest() { + delete blob_bottom_; + delete blob_bottom_nobatch_; + delete blob_top_; + } + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_bottom_nobatch_; + Blob<Dtype>* const blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; +}; + +typedef ::testing::Types<CPUDevice<float> > float_only; + +#define TestDtypesAndDevices float_only + +TYPED_TEST_CASE(InnerProductLayerTest, TestDtypesAndDevices); + +TYPED_TEST(InnerProductLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + + layer_param.set_type("InnerProduct"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<InnerProductLayer<Dtype> > layer= + boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->height(), 1); + EXPECT_EQ(this->blob_top_->width(), 1); + EXPECT_EQ(this->blob_top_->channels(), 10); +} + +/** @brief TestSetUp while toggling transpose flag + */ +TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeFalse) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->set_transpose(false); + + layer_param.set_type("InnerProduct"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<InnerProductLayer<Dtype> > layer= + boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer); + + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(2, this->blob_top_->num()); + EXPECT_EQ(1, this->blob_top_->height()); + EXPECT_EQ(1, this->blob_top_->width()); + EXPECT_EQ(10, this->blob_top_->channels()); + EXPECT_EQ(2, layer->blobs()[0]->num_axes()); + EXPECT_EQ(10, layer->blobs()[0]->shape(0)); + EXPECT_EQ(60, layer->blobs()[0]->shape(1)); +} + +/** @brief TestSetUp while toggling transpose flag + */ +TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeTrue) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->set_transpose(true); + + layer_param.set_type("InnerProduct"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<InnerProductLayer<Dtype> > layer= + boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(2, this->blob_top_->num()); + EXPECT_EQ(1, this->blob_top_->height()); + EXPECT_EQ(1, this->blob_top_->width()); + EXPECT_EQ(10, this->blob_top_->channels()); + EXPECT_EQ(2, layer->blobs()[0]->num_axes()); + EXPECT_EQ(60, layer->blobs()[0]->shape(0)); + EXPECT_EQ(10, layer->blobs()[0]->shape(1)); +} + +TYPED_TEST(InnerProductLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + bool IS_VALID_CUDA = false; +#ifndef CPU_ONLY + IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; +#endif + if (Caffe::mode() == Caffe::CPU || + sizeof(Dtype) == 4 || IS_VALID_CUDA) { + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->mutable_weight_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_min(1); + inner_product_param->mutable_bias_filler()->set_max(2); + + layer_param.set_type("InnerProduct"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<InnerProductLayer<Dtype> > layer= + boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + for (int i = 0; i < count; ++i) { + EXPECT_GE(data[i], 1.); + } + } else { + LOG(ERROR) << "Skipping test due to old architecture."; + } +} + +/** + * @brief Init. an IP layer without transpose + random weights, + * run Forward, save the result. + * Init. another IP layer with transpose. + * manually copy and transpose the weights from the first IP layer, + * then run Forward on the same input and check that the result is the same + */ +TYPED_TEST(InnerProductLayerTest, TestForwardTranspose) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + bool IS_VALID_CUDA = false; +#ifndef CPU_ONLY + IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; +#endif + if (Caffe::mode() == Caffe::CPU || + sizeof(Dtype) == 4 || IS_VALID_CUDA) { + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->mutable_weight_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_min(1); + inner_product_param->mutable_bias_filler()->set_max(2); + inner_product_param->set_transpose(false); + + layer_param.set_type("InnerProduct"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<InnerProductLayer<Dtype> > layer= + boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const int count = this->blob_top_->count(); + Blob<Dtype>* const top = new Blob<Dtype>(); + top->ReshapeLike(*this->blob_top_); + caffe_copy(count, this->blob_top_->cpu_data(), top->mutable_cpu_data()); + this->blob_top_vec_.clear(); + this->blob_top_vec_.push_back(new Blob<Dtype>()); + inner_product_param->set_transpose(true); + shared_ptr<InnerProductLayer<Dtype> > ip_t( + new InnerProductLayer<Dtype>(layer_param)); + ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int count_w = layer->blobs()[0]->count(); + EXPECT_EQ(count_w, ip_t->blobs()[0]->count()); + // manually copy and transpose the weights from 1st IP layer into 2nd + const Dtype* w = layer->blobs()[0]->cpu_data(); + Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data(); + const int width = layer->blobs()[0]->shape(1); + const int width_t = ip_t->blobs()[0]->shape(1); + for (int i = 0; i < count_w; ++i) { + int r = i / width; + int c = i % width; + w_t[c*width_t+r] = w[r*width+c]; // copy while transposing + } + // copy bias from 1st IP layer to 2nd IP layer + ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count()); + caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(), + ip_t->blobs()[1]->mutable_cpu_data()); + ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(count, this->blob_top_->count()) + << "Invalid count for top blob for IP with transpose."; + Blob<Dtype>* const top_t = new Blob<Dtype>();\ + top_t->ReshapeLike(*this->blob_top_vec_[0]); + caffe_copy(count, + this->blob_top_vec_[0]->cpu_data(), + top_t->mutable_cpu_data()); + const Dtype* data = top->cpu_data(); + const Dtype* data_t = top_t->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_FLOAT_EQ(data[i], data_t[i]); + } + } else { + LOG(ERROR) << "Skipping test due to old architecture."; + } +} + +TYPED_TEST(InnerProductLayerTest, TestForwardNoBatch) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_nobatch_); + bool IS_VALID_CUDA = false; +#ifndef CPU_ONLY + IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; +#endif + if (Caffe::mode() == Caffe::CPU || + sizeof(Dtype) == 4 || IS_VALID_CUDA) { + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->mutable_weight_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_min(1); + inner_product_param->mutable_bias_filler()->set_max(2); + + layer_param.set_type("InnerProduct"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<InnerProductLayer<Dtype> > layer= + boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + for (int i = 0; i < count; ++i) { + EXPECT_GE(data[i], 1.); + } + } else { + LOG(ERROR) << "Skipping test due to old architecture."; + } +} + + +} // namespace caffe diff --git a/unit_tests/test_lrn_layer.cpp b/unit_tests/test_lrn_layer.cpp new file mode 100644 index 00000000..22ae8c64 --- /dev/null +++ b/unit_tests/test_lrn_layer.cpp @@ -0,0 +1,344 @@ +#include <algorithm> +#include <vector> +#include <iostream> + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/lrn_layer.hpp" + +#ifdef USE_CUDNN +#include "caffe/layers/cudnn_lcn_layer.hpp" +#include "caffe/layers/cudnn_lrn_layer.hpp" +#endif + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +using std::min; +using std::max; + +int test_h=5; +int test_w=5; + +namespace caffe { + + +template <typename Dtype> +static void dump_blob(const Blob<Dtype> * blob, const char * outfile) +{ + std::ofstream os; + os.open(outfile); + + for(int i=0;i<blob->shape(0);i++) + for(int j=0;j<blob->shape(1);j++) + for(int k=0;k<blob->shape(2);k++) + for(int l=0;l<blob->shape(3);l++) + { + Dtype data=blob->data_at(i,j,k,l); + + os<<data<<std::endl; + } + + os.close(); + +} + +template <typename Dtype> +static void fill_blob_data(Blob<Dtype >* bottom, int fixed, float val) +{ + for(int i=0;i<bottom->num();i++) + for(int j=0;j<bottom->channels();j++) + for(int l=0;l<bottom->height();l++) + for(int k=0;k<bottom->width();k++) + { + int offset; + Dtype * ptr; + + offset=i*bottom->channels()*bottom->height()*bottom->width()+ + j*bottom->height()*bottom->width()+ + l*bottom->width()+k; + + ptr=bottom->mutable_cpu_data(); + + if(fixed) + ptr[offset]=val; + else + ptr[offset]=offset; + + } + + +} + + +template <typename TypeParam> +class LRNLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + + protected: + LRNLayerTest() + : epsilon_(Dtype(1e-5)), + blob_bottom_(new Blob<Dtype>()), + blob_top_(new Blob<Dtype>()) {} + virtual void SetUp() { + Caffe::set_random_seed(1701); + blob_bottom_->Reshape(2, 7, test_h,test_w); + // fill the values + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~LRNLayerTest() { delete blob_bottom_; delete blob_top_; } + void ReferenceLRNForward(const Blob<Dtype>& blob_bottom, + const LayerParameter& layer_param, Blob<Dtype>* blob_top); + + Dtype epsilon_; + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; +}; + +template <typename TypeParam> +void LRNLayerTest<TypeParam>::ReferenceLRNForward( + const Blob<Dtype>& blob_bottom, const LayerParameter& layer_param, + Blob<Dtype>* blob_top) { + typedef typename TypeParam::Dtype Dtype; + blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(), + blob_bottom.height(), blob_bottom.width()); + Dtype* top_data = blob_top->mutable_cpu_data(); + LRNParameter lrn_param = layer_param.lrn_param(); + Dtype alpha = lrn_param.alpha(); + Dtype beta = lrn_param.beta(); + int size = lrn_param.local_size(); + switch (lrn_param.norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + for (int n = 0; n < blob_bottom.num(); ++n) { + for (int c = 0; c < blob_bottom.channels(); ++c) { + for (int h = 0; h < blob_bottom.height(); ++h) { + for (int w = 0; w < blob_bottom.width(); ++w) { + int c_start = c - (size - 1) / 2; + int c_end = min(c_start + size, blob_bottom.channels()); + c_start = max(c_start, 0); + Dtype scale = 1.; + for (int i = c_start; i < c_end; ++i) { + Dtype value = blob_bottom.data_at(n, i, h, w); + scale += value * value * alpha / size; + } + *(top_data + blob_top->offset(n, c, h, w)) = + blob_bottom.data_at(n, c, h, w) / pow(scale, beta); + } + } + } + } + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + for (int n = 0; n < blob_bottom.num(); ++n) { + for (int c = 0; c < blob_bottom.channels(); ++c) { + for (int h = 0; h < blob_bottom.height(); ++h) { + int h_start = h - (size - 1) / 2; + int h_end = min(h_start + size, blob_bottom.height()); + h_start = max(h_start, 0); + for (int w = 0; w < blob_bottom.width(); ++w) { + Dtype scale = 1.; + int w_start = w - (size - 1) / 2; + int w_end = min(w_start + size, blob_bottom.width()); + w_start = max(w_start, 0); + +// std::cout<<"h,w ("<<h<<","<<w<<"): "; +// std::cout<<"box: ( h "<<h_start<<","<<h_end<<")"; +// std::cout<<" (w "<<w_start<<","<<w_end<<")"<<std::endl; + + for (int nh = h_start; nh < h_end; ++nh) { + for (int nw = w_start; nw < w_end; ++nw) { + Dtype value = blob_bottom.data_at(n, c, nh, nw); + scale += value * value * alpha / (size * size); + } + } + *(top_data + blob_top->offset(n, c, h, w)) = + blob_bottom.data_at(n, c, h, w) / pow(scale, beta); + } + } + } + } + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +typedef ::testing::Types<CPUDevice<float> > float_only; + +#define TestDtypesAndDevices float_only + +TYPED_TEST_CASE(LRNLayerTest, TestDtypesAndDevices); + +#if 1 +TYPED_TEST(LRNLayerTest, TestSetupAcrossChannels) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LRNLayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 7); + EXPECT_EQ(this->blob_top_->height(), test_h); + EXPECT_EQ(this->blob_top_->width(), test_w); +} + +TYPED_TEST(LRNLayerTest, TestForwardAcrossChannels) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; +// LRNLayer<Dtype> layer(layer_param); + + layer_param.mutable_lrn_param()->set_local_size(3); + + layer_param.set_type("LRN"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<LRNLayer<Dtype> > layer= + boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer); + + vector<int> bottom_shape; + bottom_shape.push_back(1); + bottom_shape.push_back(5); + bottom_shape.push_back(5); + bottom_shape.push_back(5); + + + this->blob_bottom_vec_[0]->Reshape(bottom_shape); + + fill_blob_data(this->blob_bottom_,1,1); + + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + + Blob<Dtype> top_reference; + this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, + &top_reference); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], + this->epsilon_); + } +} + + + +TYPED_TEST(LRNLayerTest, TestForwardAcrossChannelsLargeRegion) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_lrn_param()->set_local_size(15); + + + layer_param.set_type("LRN"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<LRNLayer<Dtype> > layer= + boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + Blob<Dtype> top_reference; + this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, + &top_reference); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], + this->epsilon_); + } +} + + +TYPED_TEST(LRNLayerTest, TestSetupWithinChannel) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_lrn_param()->set_norm_region( + LRNParameter_NormRegion_WITHIN_CHANNEL); + layer_param.mutable_lrn_param()->set_local_size(3); + + + layer_param.set_type("LRN"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<LRNLayer<Dtype> > layer= + boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 7); + EXPECT_EQ(this->blob_top_->height(), test_h); + EXPECT_EQ(this->blob_top_->width(), test_w); +} +#endif + +#if 1 + +TYPED_TEST(LRNLayerTest, TestForwardWithinChannel) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_lrn_param()->set_norm_region( + LRNParameter_NormRegion_WITHIN_CHANNEL); + layer_param.mutable_lrn_param()->set_local_size(3); +// layer_param.mutable_lrn_param()->set_beta(1); + + + layer_param.set_type("LRN"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<LRNLayer<Dtype> > layer= + boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer); + +/* presetting bottom_vec and data */ + + vector<int> bottom_shape; + bottom_shape.push_back(1); + bottom_shape.push_back(1); + bottom_shape.push_back(5); + bottom_shape.push_back(5); + + + this->blob_bottom_vec_[0]->Reshape(bottom_shape); + + fill_blob_data(this->blob_bottom_,1,1); + + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + + Blob<Dtype> top_reference; + this->ReferenceLRNForward(*(this->blob_bottom_), layer_param, + &top_reference); +// for (int i = 0; i < this->blob_bottom_->count(); ++i) { +// EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i], +// this->epsilon_); +// } + + dump_blob(this->blob_bottom_,"lrn.bottom.data"); + dump_blob(this->blob_top_,"lrn.top.data"); + dump_blob(&top_reference,"lrn.reftop.data"); + +} + +#endif + + +} // namespace caffe diff --git a/unit_tests/test_neuron_layer.cpp b/unit_tests/test_neuron_layer.cpp new file mode 100644 index 00000000..10c101da --- /dev/null +++ b/unit_tests/test_neuron_layer.cpp @@ -0,0 +1,358 @@ +#include <algorithm> +#include <vector> +#include <cmath> + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" + +#include "caffe/layers/absval_layer.hpp" +#include "caffe/layers/bnll_layer.hpp" +#include "caffe/layers/dropout_layer.hpp" +#include "caffe/layers/elu_layer.hpp" +#include "caffe/layers/exp_layer.hpp" +#include "caffe/layers/inner_product_layer.hpp" +#include "caffe/layers/log_layer.hpp" +#include "caffe/layers/power_layer.hpp" +#include "caffe/layers/prelu_layer.hpp" +#include "caffe/layers/relu_layer.hpp" +#include "caffe/layers/sigmoid_layer.hpp" +#include "caffe/layers/tanh_layer.hpp" +#include "caffe/layers/threshold_layer.hpp" + +#ifdef USE_CUDNN +#include "caffe/layers/cudnn_relu_layer.hpp" +#include "caffe/layers/cudnn_sigmoid_layer.hpp" +#include "caffe/layers/cudnn_tanh_layer.hpp" +#endif + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +typedef ::testing::Types<CPUDevice<float> > float_only; +#define TestDtypesAndDevices float_only + + +#define SET_LAYER(name) \ + layer_param.set_type(#name);\ + shared_ptr<Layer<Dtype> > new_layer=\ + LayerRegistry<Dtype>::CreateLayer(layer_param);\ + shared_ptr< name ## Layer <Dtype> > layer= \ + boost::static_pointer_cast< name ## Layer <Dtype> > (new_layer);\ + if(0) layer=shared_ptr<name ## Layer<Dtype> >(new name ## Layer<Dtype>(layer_param));\ + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + +template <typename TypeParam> +class NeuronLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + + protected: + NeuronLayerTest() + : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)), + blob_top_(new Blob<Dtype>()) { + Caffe::set_random_seed(1701); + // fill the values + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~NeuronLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; + + + void TestPReLU(PReLULayer<Dtype> *layer) { + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + const Dtype* slope_data = layer->blobs()[0]->cpu_data(); + int hw = this->blob_bottom_->height() * this->blob_bottom_->width(); + int channels = this->blob_bottom_->channels(); + bool channel_shared = layer->layer_param().prelu_param().channel_shared(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + int c = channel_shared ? 0 : (i / hw) % channels; + EXPECT_EQ(top_data[i], + std::max(bottom_data[i], (Dtype)(0)) + + slope_data[c] * std::min(bottom_data[i], (Dtype)(0))); + } + } + +}; + +TYPED_TEST_CASE(NeuronLayerTest, TestDtypesAndDevices); + +TYPED_TEST(NeuronLayerTest, TestAbsVal) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + + SET_LAYER(AbsVal); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + const int count = this->blob_bottom_->count(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(top_data[i], fabs(bottom_data[i])); + } +} + + +TYPED_TEST(NeuronLayerTest, TestReLU) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + + + SET_LAYER(ReLU); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_GE(top_data[i], 0.); + EXPECT_TRUE(top_data[i] == 0 || top_data[i] == bottom_data[i]); + } +} + +#if 1 + +TYPED_TEST(NeuronLayerTest, TestReLUWithNegativeSlope) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "relu_param { negative_slope: 0.01 }", &layer_param)); + + SET_LAYER(ReLU); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + if (top_data[i] >= 0) { + EXPECT_FLOAT_EQ(top_data[i], bottom_data[i]); + } else { + EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] * 0.01); + } + } +} + + +TYPED_TEST(NeuronLayerTest, TestSigmoid) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + + SET_LAYER(Sigmoid); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_FLOAT_EQ(top_data[i], 1. / (1 + exp(-bottom_data[i]))); + // check that we squashed the value between 0 and 1 + EXPECT_GE(top_data[i], 0.); + EXPECT_LE(top_data[i], 1.); + } +} + +TYPED_TEST(NeuronLayerTest, TestTanH) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + + int number=10; + + this->blob_bottom_->Reshape(1,2,number,2); + + for(int i=0;i<number;i++) + this->blob_bottom_->mutable_cpu_data()[i]=i*10; + + SET_LAYER(TanH); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Test exact values + for (int i = 0; i < this->blob_bottom_->num(); ++i) { + for (int j = 0; j < this->blob_bottom_->channels(); ++j) { + for (int k = 0; k < this->blob_bottom_->height(); ++k) { + for (int l = 0; l < this->blob_bottom_->width(); ++l) { + + EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4, + (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) / + (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1)); + EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4, + (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) / + (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1)); + } + } + } + } +} + + +TYPED_TEST(NeuronLayerTest, TestBNLL) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + + SET_LAYER(BNLL); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + Dtype target=log(1+exp(bottom_data[i])); + EXPECT_NEAR(top_data[i], target,1e-4); + } +} +#endif + +#if 0 /* Not try PReLU now */ + +TYPED_TEST(NeuronLayerTest, TestPReLUParam) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PReLULayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* slopes = layer.blobs()[0]->cpu_data(); + int count = layer.blobs()[0]->count(); + for (int i = 0; i < count; ++i, ++slopes) { + EXPECT_EQ(*slopes, 0.25); + } +} + +TYPED_TEST(NeuronLayerTest, TestPReLUForward) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PReLULayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(layer.blobs()[0].get()); + this->TestPReLU(&layer); +} + +TYPED_TEST(NeuronLayerTest, TestPReLUForwardChannelShared) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_prelu_param()->set_channel_shared(true); + PReLULayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + this->TestPReLU(&layer); +} + + +TYPED_TEST(NeuronLayerTest, TestPReLUConsistencyReLU) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter prelu_layer_param; + LayerParameter relu_layer_param; + relu_layer_param.mutable_relu_param()->set_negative_slope(0.25); + PReLULayer<Dtype> prelu(prelu_layer_param); + ReLULayer<Dtype> relu(relu_layer_param); + // Set up blobs + vector<Blob<Dtype>*> blob_bottom_vec_2; + vector<Blob<Dtype>*> blob_top_vec_2; + shared_ptr<Blob<Dtype> > blob_bottom_2(new Blob<Dtype>()); + shared_ptr<Blob<Dtype> > blob_top_2(new Blob<Dtype>()); + blob_bottom_vec_2.push_back(blob_bottom_2.get()); + blob_top_vec_2.push_back(blob_top_2.get()); + blob_bottom_2->CopyFrom(*this->blob_bottom_, false, true); + // SetUp layers + prelu.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + relu.SetUp(blob_bottom_vec_2, blob_top_vec_2); + // Check forward + prelu.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + relu.Forward(this->blob_bottom_vec_, blob_top_vec_2); + for (int s = 0; s < blob_top_2->count(); ++s) { + EXPECT_EQ(this->blob_top_->cpu_data()[s], blob_top_2->cpu_data()[s]); + } + // Check backward +} + +TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { + typedef typename TypeParam::Dtype Dtype; + // Set layer parameters + LayerParameter ip_layer_param; + LayerParameter prelu_layer_param; + InnerProductParameter *ip_param = + ip_layer_param.mutable_inner_product_param(); + ip_param->mutable_weight_filler()->set_type("gaussian"); + ip_param->set_num_output(3); + InnerProductLayer<Dtype> ip(ip_layer_param); + PReLULayer<Dtype> prelu(prelu_layer_param); + InnerProductLayer<Dtype> ip2(ip_layer_param); + PReLULayer<Dtype> prelu2(prelu_layer_param); + // Set up blobs + vector<Blob<Dtype>*> blob_bottom_vec_2; + vector<Blob<Dtype>*> blob_middle_vec_2; + vector<Blob<Dtype>*> blob_top_vec_2; + shared_ptr<Blob<Dtype> > blob_bottom_2(new Blob<Dtype>()); + shared_ptr<Blob<Dtype> > blob_middle_2(new Blob<Dtype>()); + shared_ptr<Blob<Dtype> > blob_top_2(new Blob<Dtype>()); + blob_bottom_vec_2.push_back(blob_bottom_2.get()); + blob_middle_vec_2.push_back(blob_middle_2.get()); + blob_top_vec_2.push_back(blob_top_2.get()); + blob_bottom_2->CopyFrom(*this->blob_bottom_, false, true); + // SetUp layers + ip.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + prelu.SetUp(this->blob_top_vec_, this->blob_top_vec_); + ip2.SetUp(blob_bottom_vec_2, blob_middle_vec_2); + prelu2.SetUp(blob_middle_vec_2, blob_top_vec_2); + caffe_copy(ip2.blobs()[0]->count(), ip.blobs()[0]->cpu_data(), + ip2.blobs()[0]->mutable_cpu_data()); + // Forward in-place + ip.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + prelu.Forward(this->blob_top_vec_, this->blob_top_vec_); + // Forward non-in-place + ip2.Forward(blob_bottom_vec_2, blob_middle_vec_2); + prelu2.Forward(blob_middle_vec_2, blob_top_vec_2); + // Check numbers + for (int s = 0; s < blob_top_2->count(); ++s) { + EXPECT_EQ(this->blob_top_->cpu_data()[s], blob_top_2->cpu_data()[s]); + } + // Fill top diff with random numbers + shared_ptr<Blob<Dtype> > tmp_blob(new Blob<Dtype>()); + tmp_blob->ReshapeLike(*blob_top_2.get()); + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(tmp_blob.get()); + caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(), + this->blob_top_->mutable_cpu_diff()); + caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(), + blob_top_2->mutable_cpu_diff()); + // Backward in-place + vector<bool> propagate_down; + propagate_down.push_back(true); + prelu.Backward(this->blob_top_vec_, propagate_down, this->blob_top_vec_); + ip.Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); + // Backward non-in-place + prelu2.Backward(blob_top_vec_2, propagate_down, blob_middle_vec_2); + ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); + // Check numbers + for (int s = 0; s < blob_bottom_2->count(); ++s) { + EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); + } + for (int s = 0; s < ip.blobs()[0]->count(); ++s) { + EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); + } + for (int s = 0; s < ip.blobs()[1]->count(); ++s) { + EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); + } + for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { + EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s], + prelu2.blobs()[0]->cpu_diff()[s]); + } +} + +#endif + +} // namespace caffe diff --git a/unit_tests/test_pooling_layer.cpp b/unit_tests/test_pooling_layer.cpp new file mode 100644 index 00000000..bde88448 --- /dev/null +++ b/unit_tests/test_pooling_layer.cpp @@ -0,0 +1,652 @@ +#include <vector> + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/pooling_layer.hpp" + +#ifdef USE_CUDNN +#include "caffe/layers/cudnn_pooling_layer.hpp" +#endif + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +typedef ::testing::Types<CPUDevice<float> > float_only; +#define TestDtypesAndDevices float_only + + +#define SET_LAYER(name) \ + layer_param.set_type(#name);\ + shared_ptr<Layer<Dtype> > new_layer=\ + LayerRegistry<Dtype>::CreateLayer(layer_param);\ + shared_ptr< name ## Layer <Dtype> > layer= \ + boost::static_pointer_cast< name ## Layer <Dtype> > (new_layer);\ + if(0) layer=shared_ptr<name ## Layer<Dtype> >(new name ## Layer<Dtype>(layer_param));\ + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + +template <typename TypeParam> +class PoolingLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + + protected: + PoolingLayerTest() + : blob_bottom_(new Blob<Dtype>()), + blob_top_(new Blob<Dtype>()), + blob_top_mask_(new Blob<Dtype>()) {} + virtual void SetUp() { + Caffe::set_random_seed(1701); + blob_bottom_->Reshape(2, 3, 6, 5); + // fill the values + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~PoolingLayerTest() { + delete blob_bottom_; + delete blob_top_; + delete blob_top_mask_; + } + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_top_; + Blob<Dtype>* const blob_top_mask_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; + // Test for 2x 2 square pooling layer + void TestForwardSquare() { + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_size(2); + pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); + const int num = 2; + const int channels = 2; + blob_bottom_->Reshape(num, channels, 3, 5); + // Input: 2x 2 channels of: + // [1 2 5 2 3] + // [9 4 1 4 8] + // [1 2 5 2 3] + for (int i = 0; i < 15 * num * channels; i += 15) { + blob_bottom_->mutable_cpu_data()[i + 0] = 1; + blob_bottom_->mutable_cpu_data()[i + 1] = 2; + blob_bottom_->mutable_cpu_data()[i + 2] = 5; + blob_bottom_->mutable_cpu_data()[i + 3] = 2; + blob_bottom_->mutable_cpu_data()[i + 4] = 3; + blob_bottom_->mutable_cpu_data()[i + 5] = 9; + blob_bottom_->mutable_cpu_data()[i + 6] = 4; + blob_bottom_->mutable_cpu_data()[i + 7] = 1; + blob_bottom_->mutable_cpu_data()[i + 8] = 4; + blob_bottom_->mutable_cpu_data()[i + 9] = 8; + blob_bottom_->mutable_cpu_data()[i + 10] = 1; + blob_bottom_->mutable_cpu_data()[i + 11] = 2; + blob_bottom_->mutable_cpu_data()[i + 12] = 5; + blob_bottom_->mutable_cpu_data()[i + 13] = 2; + blob_bottom_->mutable_cpu_data()[i + 14] = 3; + } + + SET_LAYER(Pooling); + + EXPECT_EQ(blob_top_->num(), num); + EXPECT_EQ(blob_top_->channels(), channels); + EXPECT_EQ(blob_top_->height(), 2); + EXPECT_EQ(blob_top_->width(), 4); + if (blob_top_vec_.size() > 1) { + EXPECT_EQ(blob_top_mask_->num(), num); + EXPECT_EQ(blob_top_mask_->channels(), channels); + EXPECT_EQ(blob_top_mask_->height(), 2); + EXPECT_EQ(blob_top_mask_->width(), 4); + } + layer->Forward(blob_bottom_vec_, blob_top_vec_); + // Expected output: 2x 2 channels of: + // [9 5 5 8] + // [9 5 5 8] + for (int i = 0; i < 8 * num * channels; i += 8) { + EXPECT_EQ(blob_top_->cpu_data()[i + 0], 9); + EXPECT_EQ(blob_top_->cpu_data()[i + 1], 5); + EXPECT_EQ(blob_top_->cpu_data()[i + 2], 5); + EXPECT_EQ(blob_top_->cpu_data()[i + 3], 8); + EXPECT_EQ(blob_top_->cpu_data()[i + 4], 9); + EXPECT_EQ(blob_top_->cpu_data()[i + 5], 5); + EXPECT_EQ(blob_top_->cpu_data()[i + 6], 5); + EXPECT_EQ(blob_top_->cpu_data()[i + 7], 8); + } + if (blob_top_vec_.size() > 1) { + // Expected mask output: 2x 2 channels of: + // [5 2 2 9] + // [5 12 12 9] + for (int i = 0; i < 8 * num * channels; i += 8) { + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 5); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 2); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 2); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 9); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 5); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 12); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 12); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 9); + } + } + } + // Test for 3x 2 rectangular pooling layer with kernel_h > kernel_w + void TestForwardRectHigh() { + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_h(3); + pooling_param->set_kernel_w(2); + pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); + const int num = 2; + const int channels = 2; + blob_bottom_->Reshape(num, channels, 6, 6); + // Input: 2x 2 channels of: + // [35 1 6 26 19 24] + // [ 3 32 7 21 23 25] + // [31 9 2 22 27 20] + // [ 8 28 33 17 10 15] + // [30 5 34 12 14 16] + // [ 4 36 29 13 18 11] + // (this is generated by magic(6) in MATLAB) + for (int i = 0; i < 36 * num * channels; i += 36) { + blob_bottom_->mutable_cpu_data()[i + 0] = 35; + blob_bottom_->mutable_cpu_data()[i + 1] = 1; + blob_bottom_->mutable_cpu_data()[i + 2] = 6; + blob_bottom_->mutable_cpu_data()[i + 3] = 26; + blob_bottom_->mutable_cpu_data()[i + 4] = 19; + blob_bottom_->mutable_cpu_data()[i + 5] = 24; + blob_bottom_->mutable_cpu_data()[i + 6] = 3; + blob_bottom_->mutable_cpu_data()[i + 7] = 32; + blob_bottom_->mutable_cpu_data()[i + 8] = 7; + blob_bottom_->mutable_cpu_data()[i + 9] = 21; + blob_bottom_->mutable_cpu_data()[i + 10] = 23; + blob_bottom_->mutable_cpu_data()[i + 11] = 25; + blob_bottom_->mutable_cpu_data()[i + 12] = 31; + blob_bottom_->mutable_cpu_data()[i + 13] = 9; + blob_bottom_->mutable_cpu_data()[i + 14] = 2; + blob_bottom_->mutable_cpu_data()[i + 15] = 22; + blob_bottom_->mutable_cpu_data()[i + 16] = 27; + blob_bottom_->mutable_cpu_data()[i + 17] = 20; + blob_bottom_->mutable_cpu_data()[i + 18] = 8; + blob_bottom_->mutable_cpu_data()[i + 19] = 28; + blob_bottom_->mutable_cpu_data()[i + 20] = 33; + blob_bottom_->mutable_cpu_data()[i + 21] = 17; + blob_bottom_->mutable_cpu_data()[i + 22] = 10; + blob_bottom_->mutable_cpu_data()[i + 23] = 15; + blob_bottom_->mutable_cpu_data()[i + 24] = 30; + blob_bottom_->mutable_cpu_data()[i + 25] = 5; + blob_bottom_->mutable_cpu_data()[i + 26] = 34; + blob_bottom_->mutable_cpu_data()[i + 27] = 12; + blob_bottom_->mutable_cpu_data()[i + 28] = 14; + blob_bottom_->mutable_cpu_data()[i + 29] = 16; + blob_bottom_->mutable_cpu_data()[i + 30] = 4; + blob_bottom_->mutable_cpu_data()[i + 31] = 36; + blob_bottom_->mutable_cpu_data()[i + 32] = 29; + blob_bottom_->mutable_cpu_data()[i + 33] = 13; + blob_bottom_->mutable_cpu_data()[i + 34] = 18; + blob_bottom_->mutable_cpu_data()[i + 35] = 11; + } + + SET_LAYER(Pooling); + + EXPECT_EQ(blob_top_->channels(), channels); + EXPECT_EQ(blob_top_->height(), 4); + EXPECT_EQ(blob_top_->width(), 5); + if (blob_top_vec_.size() > 1) { + EXPECT_EQ(blob_top_mask_->num(), num); + EXPECT_EQ(blob_top_mask_->channels(), channels); + EXPECT_EQ(blob_top_mask_->height(), 4); + EXPECT_EQ(blob_top_mask_->width(), 5); + } + layer->Forward(blob_bottom_vec_, blob_top_vec_); + // Expected output: 2x 2 channels of: + // [35 32 26 27 27] + // [32 33 33 27 27] + // [31 34 34 27 27] + // [36 36 34 18 18] + for (int i = 0; i < 20 * num * channels; i += 20) { + EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35); + EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32); + EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26); + EXPECT_EQ(blob_top_->cpu_data()[i + 3], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 4], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32); + EXPECT_EQ(blob_top_->cpu_data()[i + 6], 33); + EXPECT_EQ(blob_top_->cpu_data()[i + 7], 33); + EXPECT_EQ(blob_top_->cpu_data()[i + 8], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 9], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 10], 31); + EXPECT_EQ(blob_top_->cpu_data()[i + 11], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 13], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 14], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 15], 36); + EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36); + EXPECT_EQ(blob_top_->cpu_data()[i + 17], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 18], 18); + EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18); + } + if (blob_top_vec_.size() > 1) { + // [ 1 8 4 17 17] + // [ 8 21 21 17 17] + // [13 27 27 17 17] + // [32 32 27 35 35] + for (int i = 0; i < 20 * num * channels; i += 20) { + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 20); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 20); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 12); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 31); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 34); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34); + } + } + } + // Test for rectangular pooling layer with kernel_w > kernel_h + void TestForwardRectWide() { + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_h(2); + pooling_param->set_kernel_w(3); + pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); + const int num = 2; + const int channels = 2; + blob_bottom_->Reshape(num, channels, 6, 6); + // Input: 2x 2 channels of: + // [35 1 6 26 19 24] + // [ 3 32 7 21 23 25] + // [31 9 2 22 27 20] + // [ 8 28 33 17 10 15] + // [30 5 34 12 14 16] + // [ 4 36 29 13 18 11] + // (this is generated by magic(6) in MATLAB) + for (int i = 0; i < 36 * num * channels; i += 36) { + blob_bottom_->mutable_cpu_data()[i + 0] = 35; + blob_bottom_->mutable_cpu_data()[i + 1] = 1; + blob_bottom_->mutable_cpu_data()[i + 2] = 6; + blob_bottom_->mutable_cpu_data()[i + 3] = 26; + blob_bottom_->mutable_cpu_data()[i + 4] = 19; + blob_bottom_->mutable_cpu_data()[i + 5] = 24; + blob_bottom_->mutable_cpu_data()[i + 6] = 3; + blob_bottom_->mutable_cpu_data()[i + 7] = 32; + blob_bottom_->mutable_cpu_data()[i + 8] = 7; + blob_bottom_->mutable_cpu_data()[i + 9] = 21; + blob_bottom_->mutable_cpu_data()[i + 10] = 23; + blob_bottom_->mutable_cpu_data()[i + 11] = 25; + blob_bottom_->mutable_cpu_data()[i + 12] = 31; + blob_bottom_->mutable_cpu_data()[i + 13] = 9; + blob_bottom_->mutable_cpu_data()[i + 14] = 2; + blob_bottom_->mutable_cpu_data()[i + 15] = 22; + blob_bottom_->mutable_cpu_data()[i + 16] = 27; + blob_bottom_->mutable_cpu_data()[i + 17] = 20; + blob_bottom_->mutable_cpu_data()[i + 18] = 8; + blob_bottom_->mutable_cpu_data()[i + 19] = 28; + blob_bottom_->mutable_cpu_data()[i + 20] = 33; + blob_bottom_->mutable_cpu_data()[i + 21] = 17; + blob_bottom_->mutable_cpu_data()[i + 22] = 10; + blob_bottom_->mutable_cpu_data()[i + 23] = 15; + blob_bottom_->mutable_cpu_data()[i + 24] = 30; + blob_bottom_->mutable_cpu_data()[i + 25] = 5; + blob_bottom_->mutable_cpu_data()[i + 26] = 34; + blob_bottom_->mutable_cpu_data()[i + 27] = 12; + blob_bottom_->mutable_cpu_data()[i + 28] = 14; + blob_bottom_->mutable_cpu_data()[i + 29] = 16; + blob_bottom_->mutable_cpu_data()[i + 30] = 4; + blob_bottom_->mutable_cpu_data()[i + 31] = 36; + blob_bottom_->mutable_cpu_data()[i + 32] = 29; + blob_bottom_->mutable_cpu_data()[i + 33] = 13; + blob_bottom_->mutable_cpu_data()[i + 34] = 18; + blob_bottom_->mutable_cpu_data()[i + 35] = 11; + } + + SET_LAYER(Pooling); + + EXPECT_EQ(blob_top_->num(), num); + EXPECT_EQ(blob_top_->channels(), channels); + EXPECT_EQ(blob_top_->height(), 5); + EXPECT_EQ(blob_top_->width(), 4); + if (blob_top_vec_.size() > 1) { + EXPECT_EQ(blob_top_mask_->num(), num); + EXPECT_EQ(blob_top_mask_->channels(), channels); + EXPECT_EQ(blob_top_mask_->height(), 5); + EXPECT_EQ(blob_top_mask_->width(), 4); + } + layer->Forward(blob_bottom_vec_, blob_top_vec_); + // Expected output: 2x 2 channels of: + // [35 32 26 26] + // [32 32 27 27] + // [33 33 33 27] + // [34 34 34 17] + // [36 36 34 18] + for (int i = 0; i < 20 * num * channels; i += 20) { + EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35); + EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32); + EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26); + EXPECT_EQ(blob_top_->cpu_data()[i + 3], 26); + EXPECT_EQ(blob_top_->cpu_data()[i + 4], 32); + EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32); + EXPECT_EQ(blob_top_->cpu_data()[i + 6], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 7], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 8], 33); + EXPECT_EQ(blob_top_->cpu_data()[i + 9], 33); + EXPECT_EQ(blob_top_->cpu_data()[i + 10], 33); + EXPECT_EQ(blob_top_->cpu_data()[i + 11], 27); + EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 13], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 14], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 15], 17); + EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36); + EXPECT_EQ(blob_top_->cpu_data()[i + 17], 36); + EXPECT_EQ(blob_top_->cpu_data()[i + 18], 34); + EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18); + } + if (blob_top_vec_.size() > 1) { + // [ 1 8 4 4] + // [ 8 8 17 17] + // [21 21 21 17] + // [27 27 27 22] + // [32 32 27 35] + for (int i = 0; i < 20 * num * channels; i += 20) { + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 3); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 7); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 20); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 20); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 20); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 16); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 21); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 31); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 26); + EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34); + } + } + } +}; + + + + +TYPED_TEST_CASE(PoolingLayerTest, TestDtypesAndDevices); + +TYPED_TEST(PoolingLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_size(3); + pooling_param->set_stride(2); + + SET_LAYER(Pooling); + //PoolingLayer<Dtype> layer(layer_param); + //layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); + EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); + EXPECT_EQ(this->blob_top_->height(), 3); + EXPECT_EQ(this->blob_top_->width(), 2); +} + +TYPED_TEST(PoolingLayerTest, TestSetupPadded) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_size(3); + pooling_param->set_stride(2); + pooling_param->set_pad(1); + pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); +// PoolingLayer<Dtype> layer(layer_param); +// layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + SET_LAYER(Pooling); + + EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); + EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); + EXPECT_EQ(this->blob_top_->height(), 4); + EXPECT_EQ(this->blob_top_->width(), 3); +} + +TYPED_TEST(PoolingLayerTest, TestSetupGlobalPooling) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_global_pooling(true); + pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); + +// PoolingLayer<Dtype> layer(layer_param); +// layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + SET_LAYER(Pooling); + + EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); + EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels()); + EXPECT_EQ(this->blob_top_->height(), 1); + EXPECT_EQ(this->blob_top_->width(), 1); +} + +TYPED_TEST(PoolingLayerTest, TestForwardMax) { + this->TestForwardSquare(); + this->TestForwardRectHigh(); + this->TestForwardRectWide(); +} + +TYPED_TEST(PoolingLayerTest, TestForwardMaxTopMask) { + this->blob_top_vec_.push_back(this->blob_top_mask_); + this->TestForwardSquare(); + this->TestForwardRectHigh(); + this->TestForwardRectWide(); +} + +TYPED_TEST(PoolingLayerTest, TestForwardMaxPadded) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_size(3); + pooling_param->set_stride(2); + pooling_param->set_pad(2); + pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); + this->blob_bottom_->Reshape(1, 1, 3, 3); + // Input: + // [ 1 2 4 ] + // [ 2 3 2 ] + // [ 4 2 1 ] + this->blob_bottom_->mutable_cpu_data()[0] = 1; + this->blob_bottom_->mutable_cpu_data()[1] = 2; + this->blob_bottom_->mutable_cpu_data()[2] = 4; + this->blob_bottom_->mutable_cpu_data()[3] = 2; + this->blob_bottom_->mutable_cpu_data()[4] = 3; + this->blob_bottom_->mutable_cpu_data()[5] = 2; + this->blob_bottom_->mutable_cpu_data()[6] = 4; + this->blob_bottom_->mutable_cpu_data()[7] = 2; + this->blob_bottom_->mutable_cpu_data()[8] = 1; + + + SET_LAYER(Pooling); + + EXPECT_EQ(this->blob_top_->num(), 1); + EXPECT_EQ(this->blob_top_->channels(), 1); + EXPECT_EQ(this->blob_top_->height(), 3); + EXPECT_EQ(this->blob_top_->width(), 3); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + Dtype epsilon = 1e-8; + // Output: + // [ 1 4 4 ] + // [ 4 4 4 ] + // [ 4 4 1 ] + EXPECT_NEAR(this->blob_top_->cpu_data()[0], 1, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[2], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[4], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[6], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[8], 1, epsilon); +} + +template <typename Dtype> +void fill_bottom_data(Blob<Dtype >* bottom) +{ + for(int i=0;i<bottom->num();i++) + for(int j=0;j<bottom->channels();j++) + for(int l=0;l<bottom->height();l++) + for(int k=0;k<bottom->width();k++) + { + int offset; + Dtype * ptr; + + offset=i*bottom->channels()*bottom->height()*bottom->width()+ + j*bottom->height()*bottom->width()+ + l*bottom->width()+k; + + ptr=bottom->mutable_cpu_data(); + + ptr[offset]=offset; + + } + + +} + +template <typename Dtype> +void check_top_data(Blob<Dtype>* bottom, Blob<Dtype>* top, int kernel_size, int stride) +{ + Dtype epsilon = 1e-5; + int error_count=0; + + for(int i=0;i<top->num();i++) + for(int j=0;j<top->channels();j++) + for(int l=0;l<top->height();l++) + for(int k=0;k<top->width();k++) + { + Dtype pool_data=top->data_at(i,j,l,k); + Dtype max_bottom_data=-100000000; + + int top_h=stride*l; + int top_w=stride*k; + + /* calculate max **/ + for(int x=0;x<kernel_size;x++) + for(int y=0;y<kernel_size;y++) + { + Dtype d=bottom->data_at(i,j,top_h+x,top_w+y); + + if(d>max_bottom_data) + max_bottom_data=d; + } + + Dtype offset=pool_data-max_bottom_data; + if(offset>epsilon || offset<-epsilon) + + { + std::cout<<error_count<<": "; + std::cout<<i<<" "<<j<<" "<<l<<" "<<k<<" bottom: "; + std::cout<<top_h<<" "<<top_w<<std::endl; + std::cout<<"pooled: "<<pool_data<<" bottom:"<<max_bottom_data<<std::endl; + + error_count++; + } + + if(error_count==10) + return ; + } + + + +} + + +TYPED_TEST(PoolingLayerTest, TestMax) { + typedef typename TypeParam::Dtype Dtype; + + int kernel_size=3; + int stride=2; + int channel_number=96; + int h=55; + int w=55; + + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_size(kernel_size); + pooling_param->set_stride(stride); + pooling_param->set_pad(0); + pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); + this->blob_bottom_->Reshape(1, channel_number, h, w); + FillerParameter filler_param; + filler_param.set_value(Dtype(2)); + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + + SET_LAYER(Pooling); + + //fill_bottom_data(this->blob_bottom_); + + #if 0 + EXPECT_EQ(this->blob_top_->num(), 1); + EXPECT_EQ(this->blob_top_->channels(), 96); + EXPECT_EQ(this->blob_top_->height(), 27); + EXPECT_EQ(this->blob_top_->width(), 27); +#endif + + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + check_top_data(this->blob_bottom_,this->blob_top_,kernel_size,stride); + +} + +TYPED_TEST(PoolingLayerTest, TestForwardAve) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); + pooling_param->set_kernel_size(3); + pooling_param->set_stride(1); + pooling_param->set_pad(1); + pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); + this->blob_bottom_->Reshape(1, 1, 3, 3); + FillerParameter filler_param; + filler_param.set_value(Dtype(2)); + ConstantFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + PoolingLayer<Dtype> layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 1); + EXPECT_EQ(this->blob_top_->channels(), 1); + EXPECT_EQ(this->blob_top_->height(), 3); + EXPECT_EQ(this->blob_top_->width(), 3); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + Dtype epsilon = 1e-5; + EXPECT_NEAR(this->blob_top_->cpu_data()[0], 8.0 / 9, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4.0 / 3, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[2], 8.0 / 9, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4.0 / 3, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[4], 2.0 , epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4.0 / 3, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[6], 8.0 / 9, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4.0 / 3, epsilon); + EXPECT_NEAR(this->blob_top_->cpu_data()[8], 8.0 / 9, epsilon); +} + + +} // namespace caffe diff --git a/unit_tests/test_softmax_layer.cpp b/unit_tests/test_softmax_layer.cpp new file mode 100644 index 00000000..ab2f8362 --- /dev/null +++ b/unit_tests/test_softmax_layer.cpp @@ -0,0 +1,99 @@ +#include <cmath> +#include <vector> + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/softmax_layer.hpp" + +#ifdef USE_CUDNN +#include "caffe/layers/cudnn_softmax_layer.hpp" +#endif + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template <typename TypeParam> +class SoftmaxLayerTest : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + protected: + SoftmaxLayerTest() + : blob_bottom_(new Blob<Dtype>(2, 10, 1, 1)), + blob_top_(new Blob<Dtype>()) { + // fill the values + FillerParameter filler_param; + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~SoftmaxLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_top_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; +}; + + +typedef ::testing::Types<CPUDevice<float> > float_only; + +#define TestDtypesAndDevices float_only + + +TYPED_TEST_CASE(SoftmaxLayerTest, TestDtypesAndDevices); + +TYPED_TEST(SoftmaxLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + + + + layer_param.set_type("Softmax"); + + shared_ptr<Layer<Dtype> > new_layer= + LayerRegistry<Dtype>::CreateLayer(layer_param); + + shared_ptr<SoftmaxLayer<Dtype> > layer= + boost::static_pointer_cast<SoftmaxLayer<Dtype> > (new_layer); + +// layer=shared_ptr<SoftmaxLayer<Dtype> >(new SoftmaxLayer<Dtype>(layer_param)); + + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + + // Test sum + for (int i = 0; i < this->blob_bottom_->num(); ++i) { + for (int k = 0; k < this->blob_bottom_->height(); ++k) { + for (int l = 0; l < this->blob_bottom_->width(); ++l) { + Dtype sum = 0; + for (int j = 0; j < this->blob_top_->channels(); ++j) { + sum += this->blob_top_->data_at(i, j, k, l); + } + EXPECT_GE(sum, 0.999); + EXPECT_LE(sum, 1.001); + // Test exact values + Dtype scale = 0; + for (int j = 0; j < this->blob_bottom_->channels(); ++j) { + scale += exp(this->blob_bottom_->data_at(i, j, k, l)); + } + for (int j = 0; j < this->blob_bottom_->channels(); ++j) { + EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4, + exp(this->blob_bottom_->data_at(i, j, k, l)) / scale) + << "debug: " << i << " " << j; + EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4, + exp(this->blob_bottom_->data_at(i, j, k, l)) / scale) + << "debug: " << i << " " << j; + } + } + } + } +} + + + +} // namespace caffe diff --git a/unit_tests/testbed.c b/unit_tests/testbed.c new file mode 100644 index 00000000..d33433d0 --- /dev/null +++ b/unit_tests/testbed.c @@ -0,0 +1,146 @@ +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <unistd.h> +#include <getopt.h> +#include <string.h> + + +#include "pmu.h" +#include "testbed.h" + +struct armv8_event +{ + char * name; + int id; + uint32_t init_val; + char * note; +}; + + +static struct armv8_event a57_list[6]= +{ + {"INST",0x8,0,"instruction retired"}, + {"CYCL",0x11,0,"CPU running cycle"}, + {"L1D MISS",0x3,0,"L1D CACHE MISS/REFILL"}, + {"L1D ACCESS",0x4,0,"L1D CACHE ACCESS"}, + {"L2 MISS",0x17,0,"L2 CACHE MISS/REFILL"}, + {"L2 ACCESS",0x16,0,"L2 CACHE ACCESS"} +}; + +static int e[6]; + +void init_testbed(void) +{ + int i; + struct armv8_event * p_list; + + init_pmu_registers(); + + p_list=a57_list; + + for(i=0;i<6;i++) + { + e[i]=create_pmu_event(p_list[i].name,p_list[i].id, + p_list[i].init_val,p_list[i].note); + } + +} + +void run_test(int reptition, int warm_up,void (*test_func)(void *),void * arg) +{ + uint32_t t0,t1; + uint32_t freq; + uint32_t cycle; + uint64_t total_time=0; + uint32_t loop_count=0; + int i; + + if(warm_up) + test_func(arg); + + + freq=read_32bit_sysreg(CNTFRQ_EL0); + + start_pmu_event(e[0]); + start_pmu_event(e[1]); + start_pmu_event(e[2]); + start_pmu_event(e[3]); + start_pmu_event(e[4]); + start_pmu_event(e[5]); + + set_pmu_event_base(e[0]); + set_pmu_event_base(e[1]); + set_pmu_event_base(e[2]); + set_pmu_event_base(e[3]); + set_pmu_event_base(e[4]); + set_pmu_event_base(e[5]); + + t0=read_32bit_sysreg(CNTVCT_EL0); + + for(i=0;i<reptition;i++) + { + test_func(arg); + + record_pmu_event(e[0],0,1,1); + record_pmu_event(e[1],0,1,1); + record_pmu_event(e[2],0,1,1); + record_pmu_event(e[3],0,1,1); + record_pmu_event(e[4],0,1,1); + record_pmu_event(e[5],0,1,1); + + t1=read_32bit_sysreg(CNTVCT_EL0); + loop_count++; + total_time+=(t1-t0); + t0=t1; + + } + + + stop_pmu_event(e[0]); + stop_pmu_event(e[1]); + stop_pmu_event(e[2]); + stop_pmu_event(e[3]); + stop_pmu_event(e[4]); + stop_pmu_event(e[5]); + + dump_pmu_event_stat(e[0]); + dump_pmu_event_stat(e[1]); + dump_pmu_event_stat(e[2]); + dump_pmu_event_stat(e[3]); + dump_pmu_event_stat(e[4]); + dump_pmu_event_stat(e[5]); + + + printf("\n------------------------------------\n\n"); + + + cycle=get_pmu_stat_avg(e[1]); + t0=total_time/loop_count; + + + printf("freq is 0x%x\n",freq); + printf("pysical counter pass: 0x%x (0x%lx/%u)\n",t0,total_time,loop_count); + printf("coverted to ms: %.3f\n",1000.0*t0/freq); + + + printf("CPU freq: %.2f MHZ (cycle:0x%x)\n",(float)freq*cycle/t0/1000000,cycle); + + printf("IPC is: %.2f \n",(float)get_pmu_stat_avg(e[0])/cycle); + printf("L1 CACHE MISS is: %.2f \n",(float)get_pmu_stat_avg(e[2])/get_pmu_stat_avg(e[3])); + printf("L2 CACHE MISS is: %.2f \n",(float)get_pmu_stat_avg(e[4])/get_pmu_stat_avg(e[5])); + + /*reset all record */ + +} + +void release_testbed(void) +{ + + release_pmu_event(e[0]); + release_pmu_event(e[1]); + release_pmu_event(e[2]); + release_pmu_event(e[3]); + release_pmu_event(e[4]); + release_pmu_event(e[5]); +} diff --git a/unit_tests/testbed.h b/unit_tests/testbed.h new file mode 100644 index 00000000..7a7f3df1 --- /dev/null +++ b/unit_tests/testbed.h @@ -0,0 +1,10 @@ +#ifndef __TESTBED_H__ +#define __TESTBED_H__ + +void init_testbed(void); + +void run_test(int reptition, int warm_up,void (*test_func)(void *),void * arg); + +void release_testbed(void); + +#endif |