-rw-r--r--  Makefile | 49
-rw-r--r--  Makefile.config.acl | 140
-rw-r--r--  README.md | 11
-rw-r--r--  cmake/Dependencies.cmake | 15
-rw-r--r--  cmake/Modules/FindACL.cmake | 37
-rw-r--r--  docs/caffeOnACL_release_notes_0_2_0.docx | bin 0 -> 180265 bytes
-rw-r--r--  docs/caffeOnACL_user_guide_0_2_0.docx | bin 0 -> 471763 bytes
-rw-r--r--  examples/cpp_classification/classification_profiling.cpp | 546
-rw-r--r--  examples/cpp_classification/classification_profiling_gpu.cpp | 546
-rw-r--r--  include/caffe/acl_layer.hpp | 246
-rw-r--r--  include/caffe/caffe.hpp | 6
-rw-r--r--  include/caffe/common.hpp | 12
-rw-r--r--  include/caffe/layer.hpp | 92
-rw-r--r--  include/caffe/layers/acl_absval_layer.hpp | 57
-rw-r--r--  include/caffe/layers/acl_base_activation_layer.hpp | 56
-rw-r--r--  include/caffe/layers/acl_bnll_layer.hpp | 57
-rw-r--r--  include/caffe/layers/acl_conv_layer.hpp | 56
-rw-r--r--  include/caffe/layers/acl_inner_product_layer.hpp | 54
-rw-r--r--  include/caffe/layers/acl_lrn_layer.hpp | 54
-rw-r--r--  include/caffe/layers/acl_pooling_layer.hpp | 54
-rw-r--r--  include/caffe/layers/acl_relu_layer.hpp | 56
-rw-r--r--  include/caffe/layers/acl_sigmoid_layer.hpp | 55
-rw-r--r--  include/caffe/layers/acl_softmax_layer.hpp | 54
-rw-r--r--  include/caffe/layers/acl_tanh_layer.hpp | 56
-rw-r--r--  include/caffe/util/device_alternate.hpp | 4
-rw-r--r--  include/caffe/util/hdf5.hpp | 2
-rw-r--r--  src/caffe/acl_layer.cpp | 274
-rw-r--r--  src/caffe/common.cpp | 12
-rw-r--r--  src/caffe/layer.cpp | 74
-rw-r--r--  src/caffe/layer_factory.cpp | 62
-rw-r--r--  src/caffe/layers/absval_layer.cpp | 2
-rw-r--r--  src/caffe/layers/acl_absval_layer.cpp | 64
-rw-r--r--  src/caffe/layers/acl_base_activation_layer.cpp | 97
-rw-r--r--  src/caffe/layers/acl_bnll_layer.cpp | 61
-rw-r--r--  src/caffe/layers/acl_conv_layer.cpp | 218
-rw-r--r--  src/caffe/layers/acl_inner_product_layer.cpp | 131
-rw-r--r--  src/caffe/layers/acl_lrn_layer.cpp | 144
-rw-r--r--  src/caffe/layers/acl_pooling_layer.cpp | 150
-rw-r--r--  src/caffe/layers/acl_relu_layer.cpp | 70
-rw-r--r--  src/caffe/layers/acl_sigmoid_layer.cpp | 61
-rw-r--r--  src/caffe/layers/acl_softmax_layer.cpp | 117
-rw-r--r--  src/caffe/layers/acl_tanh_layer.cpp | 63
-rw-r--r--  src/caffe/layers/bnll_layer.cpp | 2
-rw-r--r--  src/caffe/layers/hdf5_data_layer.cpp | 2
-rw-r--r--  src/caffe/layers/hdf5_data_layer.cu | 2
-rw-r--r--  src/caffe/layers/hdf5_output_layer.cpp | 2
-rw-r--r--  src/caffe/layers/hdf5_output_layer.cu | 2
-rw-r--r--  src/caffe/layers/inner_product_layer.cpp | 2
-rw-r--r--  src/caffe/net.cpp | 16
-rw-r--r--  src/caffe/solvers/sgd_solver.cpp | 10
-rw-r--r--  src/caffe/syncedmem.cpp | 19
-rw-r--r--  src/caffe/test/test_hdf5_output_layer.cpp | 2
-rw-r--r--  src/caffe/test/test_hdf5data_layer.cpp | 2
-rw-r--r--  src/caffe/util/hdf5.cpp | 2
-rw-r--r--  src/caffe/util/math_functions.cpp | 4
-rw-r--r--  unit_tests/Makefile | 87
-rw-r--r--  unit_tests/pmu.c | 376
-rw-r--r--  unit_tests/pmu.h | 130
-rw-r--r--  unit_tests/prof_convolution_layer.cpp | 302
-rw-r--r--  unit_tests/sgemm.cpp | 74
-rw-r--r--  unit_tests/test.cpp | 37
-rw-r--r--  unit_tests/test_caffe_main.cpp | 34
-rw-r--r--  unit_tests/test_common.cpp | 64
-rw-r--r--  unit_tests/test_convolution_layer.cpp | 888
-rw-r--r--  unit_tests/test_fail.cpp | 419
-rw-r--r--  unit_tests/test_inner_product_layer.cpp | 295
-rw-r--r--  unit_tests/test_lrn_layer.cpp | 344
-rw-r--r--  unit_tests/test_neuron_layer.cpp | 358
-rw-r--r--  unit_tests/test_pooling_layer.cpp | 652
-rw-r--r--  unit_tests/test_softmax_layer.cpp | 99
-rw-r--r--  unit_tests/testbed.c | 146
-rw-r--r--  unit_tests/testbed.h | 10
72 files changed, 8287 insertions(+), 10 deletions(-)
diff --git a/Makefile b/Makefile
index 77900b69..2d5d3058 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,12 @@ $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example.)
endif
include $(CONFIG_FILE)
+ifeq ($(CPU_ONLY),1)
+ USE_CUDA := 0
+endif
+ifeq ($(USE_ACL),1)
+ USE_CUDA := 0
+endif
BUILD_DIR_LINK := $(BUILD_DIR)
ifeq ($(RELEASE_BUILD_DIR),)
RELEASE_BUILD_DIR := .$(BUILD_DIR)_release
@@ -172,13 +178,13 @@ endif
CUDA_LIB_DIR += $(CUDA_DIR)/lib
INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include
-ifneq ($(CPU_ONLY), 1)
+ifeq ($(USE_CUDA), 1)
INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
LIBRARY_DIRS += $(CUDA_LIB_DIR)
LIBRARIES := cudart cublas curand
endif
-LIBRARIES += glog gflags protobuf boost_system boost_filesystem m hdf5_hl hdf5
+LIBRARIES += glog gflags protobuf boost_system boost_filesystem m
# handle IO dependencies
USE_LEVELDB ?= 1
@@ -271,7 +277,7 @@ endif
# libstdc++ for NVCC compatibility on OS X >= 10.9 with CUDA < 7.0
ifeq ($(OSX), 1)
CXX := /usr/bin/clang++
- ifneq ($(CPU_ONLY), 1)
+ ifeq ($(USE_CUDA), 1)
CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | tr -d '[a-z ]')
ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1)
CXXFLAGS += -stdlib=libstdc++
@@ -296,6 +302,10 @@ ifeq ($(OSX), 1)
ORIGIN := @loader_path
VERSIONFLAGS += -Wl,-install_name,@rpath/$(DYNAMIC_VERSIONED_NAME_SHORT) -Wl,-rpath,$(ORIGIN)/../../build/lib
else
+ ifeq (${USE_OPENMP}, 1)
+ CXXFLAGS += -fopenmp
+ LINKFLAGS += -fopenmp
+ endif
ORIGIN := \$$ORIGIN
endif
@@ -334,6 +344,27 @@ ifeq ($(USE_NCCL), 1)
COMMON_FLAGS += -DUSE_NCCL
endif
+# ACL acceleration configuration
+ifeq ($(USE_ACL), 1)
+ LIBRARY_DIRS += $(ACL_LIBS_DIR)
+ LIBRARIES += $(ACL_LIBS)
+ INCLUDE_DIRS +=$(ACL_INCS)
+ COMMON_FLAGS += -DUSE_ACL -std=c++11
+endif
+
+#USE_PROFILING -- gather profiling information (output is controlled via LOGACL)
+#LAYER_PERF_STAT -- collect per-layer network profiling statistics
+ifeq ($(USE_PROFILING), 1)
+ COMMON_FLAGS += -DUSE_PROFILING -DLAYER_PERF_STAT
+endif
+#HDF5
+ifeq ($(USE_HDF5), 1)
+ LIBRARY_DIRS += $(HDF5_LIBRARY_DIRS)
+ LIBRARIES += $(HDF5_LIBRARIES)
+ INCLUDE_DIRS +=$(HDF5_INCLUDE_DIRS)
+ COMMON_FLAGS += -DUSE_HDF5
+endif
+
# configure IO libraries
ifeq ($(USE_OPENCV), 1)
COMMON_FLAGS += -DUSE_OPENCV
@@ -358,6 +389,15 @@ ifeq ($(CPU_ONLY), 1)
COMMON_FLAGS += -DCPU_ONLY
endif
+ifeq ($(USE_ACL), 1)
+ OBJS := $(PROTO_OBJS) $(CXX_OBJS)
+ TEST_OBJS := $(TEST_CXX_OBJS)
+ TEST_BINS := $(TEST_CXX_BINS)
+ ALL_WARNS := $(ALL_CXX_WARNS)
+ TEST_FILTER := --gtest_filter="-*GPU*"
+ COMMON_FLAGS += -DCPU_ONLY
+endif
+
# Python layer support
ifeq ($(WITH_PYTHON_LAYER), 1)
COMMON_FLAGS += -DWITH_PYTHON_LAYER
@@ -365,7 +405,8 @@ ifeq ($(WITH_PYTHON_LAYER), 1)
endif
# BLAS configuration (default = ATLAS)
-BLAS ?= atlas
+#BLAS ?= atlas
+BLAS ?= open
ifeq ($(BLAS), mkl)
# MKL
LIBRARIES += mkl_rt
diff --git a/Makefile.config.acl b/Makefile.config.acl
new file mode 100644
index 00000000..b30759fb
--- /dev/null
+++ b/Makefile.config.acl
@@ -0,0 +1,140 @@
+## Refer to http://caffe.berkeleyvision.org/installation.html
+# Contributions simplifying and improving our build system are welcome!
+
+# cuDNN acceleration switch (uncomment to build with cuDNN).
+# USE_CUDNN := 1
+
+# CPU-only switch (uncomment to build without GPU support).
+CPU_ONLY := 1
+
+USE_PROFILING := 0
+
+USE_ACL :=1
+ACL_ROOT :=/home/firefly/ComputeLibrary
+ACL_INCS :=$(ACL_ROOT)/include
+ACL_INCS +=$(ACL_ROOT)
+ACL_LIBS_DIR :=$(ACL_ROOT)/build
+ACL_LIBS_DIR +=$(ACL_ROOT)/build/arm_compute
+ACL_LIBS :=arm_compute OpenCL
+
+# uncomment to disable IO dependencies and corresponding data layers
+# USE_OPENCV := 0
+# USE_LEVELDB := 0
+# USE_LMDB := 0
+
+# uncomment to allow MDB_NOLOCK when reading LMDB files (only if necessary)
+# You should not set this flag if you will be reading LMDBs with any
+# possibility of simultaneous read and write
+# ALLOW_LMDB_NOLOCK := 1
+
+# Uncomment if you're using OpenCV 3
+# OPENCV_VERSION := 3
+
+# To customize your choice of compiler, uncomment and set the following.
+# N.B. the default for Linux is g++ and the default for OSX is clang++
+# CUSTOM_CXX := g++
+#CUSTOM_CXX := aarch64-linux-gnu-g++
+#os :=linux
+#arch :=arm64-v8a
+
+# CUDA directory contains bin/ and lib/ directories that we need.
+CUDA_DIR := /usr/local/cuda
+# On Ubuntu 14.04, if cuda tools are installed via
+# "sudo apt-get install nvidia-cuda-toolkit" then use this instead:
+# CUDA_DIR := /usr
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility.
+# For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility.
+CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
+ -gencode arch=compute_20,code=sm_21 \
+ -gencode arch=compute_30,code=sm_30 \
+ -gencode arch=compute_35,code=sm_35 \
+ -gencode arch=compute_50,code=sm_50 \
+ -gencode arch=compute_52,code=sm_52 \
+ -gencode arch=compute_60,code=sm_60 \
+ -gencode arch=compute_61,code=sm_61 \
+ -gencode arch=compute_61,code=compute_61
+
+# BLAS choice:
+# atlas for ATLAS (default)
+# mkl for MKL
+# open for OpenBLAS
+#BLAS := atlas
+BLAS := open
+# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
+# Leave commented to accept the defaults for your choice of BLAS
+# (which should work)!
+# BLAS_INCLUDE := /path/to/your/blas
+# BLAS_LIB := /path/to/your/blas
+
+# Homebrew puts openblas in a directory that is not on the standard search path
+# BLAS_INCLUDE := $(shell brew --prefix openblas)/include
+# BLAS_LIB := $(shell brew --prefix openblas)/lib
+
+# This is required only if you will compile the matlab interface.
+# MATLAB directory should contain the mex binary in /bin.
+# MATLAB_DIR := /usr/local
+# MATLAB_DIR := /Applications/MATLAB_R2012b.app
+
+# NOTE: this is required only if you will compile the python interface.
+# We need to be able to find Python.h and numpy/arrayobject.h.
+PYTHON_INCLUDE := /usr/include/python2.7 \
+ /usr/lib/python2.7/dist-packages/numpy/core/include
+# Anaconda Python distribution is quite popular. Include path:
+# Verify anaconda location, sometimes it's in root.
+# ANACONDA_HOME := $(HOME)/anaconda
+# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \
+ # $(ANACONDA_HOME)/include/python2.7 \
+ # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include
+
+# Uncomment to use Python 3 (default is Python 2)
+# PYTHON_LIBRARIES := boost_python3 python3.5m
+# PYTHON_INCLUDE := /usr/include/python3.5m \
+# /usr/lib/python3.5/dist-packages/numpy/core/include
+
+# We need to be able to find libpythonX.X.so or .dylib.
+PYTHON_LIB := /usr/lib
+# PYTHON_LIB := $(ANACONDA_HOME)/lib
+
+# Homebrew installs numpy in a non standard path (keg only)
+# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include
+# PYTHON_LIB += $(shell brew --prefix numpy)/lib
+
+# Uncomment to support layers written in Python (will link against Python libs)
+# WITH_PYTHON_LAYER := 1
+
+# Whatever else you find you need goes here.
+INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
+LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
+
+# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies
+# INCLUDE_DIRS += $(shell brew --prefix)/include
+# LIBRARY_DIRS += $(shell brew --prefix)/lib
+
+# NCCL acceleration switch (uncomment to build with NCCL)
+# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0)
+# USE_NCCL := 1
+
+# Uncomment to use `pkg-config` to specify OpenCV library paths.
+# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.)
+# USE_PKG_CONFIG := 1
+
+# N.B. both build and distribute dirs are cleared on `make clean`
+BUILD_DIR := build
+DISTRIBUTE_DIR := distribute
+
+#HDF5
+USE_HDF5 := 1
+HDF5_INCLUDE_DIRS := /usr/include/hdf5/serial
+HDF5_LIBRARY_DIRS := /usr/lib/aarch64-linux-gnu/hdf5/serial
+HDF5_LIBRARIES :=hdf5_hl hdf5
+
+# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171
+# DEBUG := 1
+
+# The ID of the GPU that 'make runtest' will use to run unit tests.
+TEST_GPUID := 0
+
+# enable pretty build (comment to see full commands)
+Q ?= @
diff --git a/README.md b/README.md
index 44b9e62c..a20099a9 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,14 @@
+# CaffeOnACL
+[![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
+
+CaffeOnACL uses the ARM Compute Library (NEON + GPU) to accelerate Caffe, and provides utilities to debug, profile, and tune application performance.
+
+See the following documents for details:
+- [release notes](https://github.com/OAID/caffeOnACL/tree/master/docs/caffeOnACL_release_notes_0_2_0.docx)
+- [user guide](https://github.com/OAID/caffeOnACL/tree/master/docs/caffeOnACL_user_guide_0_2_0.docx)
+
+
+
# Caffe
[![Build Status](https://travis-ci.org/BVLC/caffe.svg?branch=master)](https://travis-ci.org/BVLC/caffe)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 02c81525..888443d0 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -66,6 +66,21 @@ if(USE_LEVELDB)
list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_LEVELDB)
endif()
+# ---[ ACL
+if(USE_ACL)
+ find_package(ACL REQUIRED)
+ list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${ACL_INCLUDE})
+ list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${ACL_INCLUDE}/include)
+ set(__list ${ACL_LIBRARIES})
+ separate_arguments(__list)
+ list(REMOVE_DUPLICATES __list)
+ foreach(i ${__list})
+ list(APPEND Caffe_LINKER_LIBS PUBLIC ${i})
+ endforeach()
+ list(APPEND Caffe_COMPILE_OPTIONS PRIVATE -std=c++11)
+ list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_ACL)
+endif()
+
# ---[ Snappy
if(USE_LEVELDB)
find_package(Snappy REQUIRED)
diff --git a/cmake/Modules/FindACL.cmake b/cmake/Modules/FindACL.cmake
new file mode 100644
index 00000000..d7df5aaa
--- /dev/null
+++ b/cmake/Modules/FindACL.cmake
@@ -0,0 +1,37 @@
+set(ACL_INC_PATHS
+ /usr/include
+ /usr/local/include
+ /usr/local/acl
+ $ENV{ACL_DIR}/include
+ )
+
+set(ACL_LIB_PATHS
+ /lib
+ /lib64
+ /usr/lib
+ /usr/lib64
+ /usr/local/lib
+ /usr/local/lib64
+ /usr/local/acl/lib
+ /usr/local/acl/lib64
+ $ENV{ACL_DIR}/lib
+ )
+
+find_path(ACL_INCLUDE NAMES arm_compute PATHS ${ACL_INC_PATHS})
+find_library(ACL_LIBRARIES NAMES arm_compute-static PATHS ${ACL_LIB_PATHS})
+find_library(ACL_CORE_LIBRARIES NAMES arm_compute_core-static PATHS ${ACL_LIB_PATHS})
+SET(ACL_LIBRARIES "${ACL_CORE_LIBRARIES} ${ACL_LIBRARIES}")
+
+if(ACL_INCS)
+ SET(ACL_INCLUDE "${ACL_INCS}")
+ SET(ACL_LIBRARIES "${ACL_LIBS}")
+ SET(ACL_FOUND 1)
+else ()
+ include(FindPackageHandleStandardArgs)
+ find_package_handle_standard_args(ACL DEFAULT_MSG ACL_INCLUDE ACL_LIBRARIES)
+endif ()
+
+if (ACL_FOUND)
+ message(STATUS "Found ACL (include: ${ACL_INCLUDE}, library: ${ACL_LIBRARIES})")
+ mark_as_advanced(ACL_INCLUDE ACL_LIBRARIES)
+endif ()
diff --git a/docs/caffeOnACL_release_notes_0_2_0.docx b/docs/caffeOnACL_release_notes_0_2_0.docx
new file mode 100644
index 00000000..0c1c0141
--- /dev/null
+++ b/docs/caffeOnACL_release_notes_0_2_0.docx
Binary files differ
diff --git a/docs/caffeOnACL_user_guide_0_2_0.docx b/docs/caffeOnACL_user_guide_0_2_0.docx
new file mode 100644
index 00000000..a7ebf61d
--- /dev/null
+++ b/docs/caffeOnACL_user_guide_0_2_0.docx
Binary files differ
diff --git a/examples/cpp_classification/classification_profiling.cpp b/examples/cpp_classification/classification_profiling.cpp
new file mode 100644
index 00000000..f5d5eaed
--- /dev/null
+++ b/examples/cpp_classification/classification_profiling.cpp
@@ -0,0 +1,546 @@
+#include <caffe/caffe.hpp>
+#ifdef USE_OPENCV
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif // USE_OPENCV
+#include <algorithm>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifdef USE_PROFILING
+
+#include <iostream>
+
+#include <time.h>
+
+#define REPEAT_TEST
+
+/* Return the current time in milliseconds (CLOCK_MONOTONIC_COARSE). */
+unsigned long get_cur_time(void)
+{
+ struct timespec tm;
+
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &tm);
+
+ return (tm.tv_sec*1000+tm.tv_nsec/1000000);
+}
+
+#endif //USE_PROFILING
+
+#ifdef USE_OPENCV
+using namespace caffe; // NOLINT(build/namespaces)
+using std::string;
+
+/* Pair (label, confidence) representing a prediction. */
+typedef std::pair<string, float> Prediction;
+
+class Classifier {
+ public:
+ Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file);
+
+ std::vector<Prediction> Classify(const cv::Mat& img, int N = 5);
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+ void dump_perf_stat(void);
+ void dump_single_layer_io(int idx, Layer<float> * p_layer);
+ void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time);
+#ifdef REPEAT_TEST
+ void collect_layer_stat(vector<vector<perf_stat> * > & all_stat);
+ void dump_all_stat(vector <vector<perf_stat>*>& all_stat);
+ void reset_layer_stat();
+#endif
+#endif
+
+#endif //USE_PROFILING
+
+ private:
+ void SetMean(const string& mean_file);
+
+ std::vector<float> Predict(const cv::Mat& img);
+
+ void WrapInputLayer(std::vector<cv::Mat>* input_channels);
+
+ void Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels);
+
+ private:
+ shared_ptr<Net<float> > net_;
+ cv::Size input_geometry_;
+ int num_channels_;
+ cv::Mat mean_;
+ std::vector<string> labels_;
+};
+
+Classifier::Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file) {
+#ifdef CPU_ONLY
+ Caffe::set_mode(Caffe::CPU);
+#else
+ Caffe::set_mode(Caffe::GPU);
+#endif
+
+ /* Load the network. */
+ net_.reset(new Net<float>(model_file, TEST));
+ net_->CopyTrainedLayersFrom(trained_file);
+
+ CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
+ CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";
+
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ num_channels_ = input_layer->channels();
+ CHECK(num_channels_ == 3 || num_channels_ == 1)
+ << "Input layer should have 1 or 3 channels.";
+ input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
+
+ /* Load the binaryproto mean file. */
+ SetMean(mean_file);
+
+ /* Load labels. */
+ std::ifstream labels(label_file.c_str());
+ CHECK(labels) << "Unable to open labels file " << label_file;
+ string line;
+ while (std::getline(labels, line))
+ labels_.push_back(string(line));
+
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ CHECK_EQ(labels_.size(), output_layer->channels())
+ << "Number of labels is different from the output layer dimension.";
+}
+
+static bool PairCompare(const std::pair<float, int>& lhs,
+ const std::pair<float, int>& rhs) {
+ return lhs.first > rhs.first;
+}
+
+/* Return the indices of the top N values of vector v. */
+static std::vector<int> Argmax(const std::vector<float>& v, int N) {
+ std::vector<std::pair<float, int> > pairs;
+ for (size_t i = 0; i < v.size(); ++i)
+ pairs.push_back(std::make_pair(v[i], i));
+ std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);
+
+ std::vector<int> result;
+ for (int i = 0; i < N; ++i)
+ result.push_back(pairs[i].second);
+ return result;
+}
+
+/* Return the top N predictions. */
+std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) {
+ std::vector<float> output = Predict(img);
+
+ N = std::min<int>(labels_.size(), N);
+ std::vector<int> maxN = Argmax(output, N);
+ std::vector<Prediction> predictions;
+ for (int i = 0; i < N; ++i) {
+ int idx = maxN[i];
+ predictions.push_back(std::make_pair(labels_[idx], output[idx]));
+ }
+
+ return predictions;
+}
+
+/* Load the mean file in binaryproto format. */
+void Classifier::SetMean(const string& mean_file) {
+ BlobProto blob_proto;
+ ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+
+ /* Convert from BlobProto to Blob<float> */
+ Blob<float> mean_blob;
+ mean_blob.FromProto(blob_proto);
+ CHECK_EQ(mean_blob.channels(), num_channels_)
+ << "Number of channels of mean file doesn't match input layer.";
+
+ /* The format of the mean file is planar 32-bit float BGR or grayscale. */
+ std::vector<cv::Mat> channels;
+ float* data = mean_blob.mutable_cpu_data();
+ for (int i = 0; i < num_channels_; ++i) {
+ /* Extract an individual channel. */
+ cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data);
+ channels.push_back(channel);
+ data += mean_blob.height() * mean_blob.width();
+ }
+
+ /* Merge the separate channels into a single image. */
+ cv::Mat mean;
+ cv::merge(channels, mean);
+
+ /* Compute the global mean pixel value and create a mean image
+ * filled with this value. */
+ cv::Scalar channel_mean = cv::mean(mean);
+ mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean);
+}
+
+std::vector<float> Classifier::Predict(const cv::Mat& img) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ input_layer->Reshape(1, num_channels_,
+ input_geometry_.height, input_geometry_.width);
+ /* Forward dimension change to all layers. */
+ net_->Reshape();
+
+ std::vector<cv::Mat> input_channels;
+ WrapInputLayer(&input_channels);
+
+ Preprocess(img, &input_channels);
+
+#ifdef USE_PROFILING
+ unsigned long tstart=get_cur_time();
+#endif //USE_PROFILING
+
+ net_->Forward();
+
+#ifdef USE_PROFILING
+
+ unsigned long tend=get_cur_time();
+
+ std::cout<<"used time: "<<tend-tstart<<std::endl;
+
+#ifdef LAYER_PERF_STAT
+ dump_perf_stat();
+#ifdef REPEAT_TEST
+
+ reset_layer_stat();
+
+ vector<vector<perf_stat>* > all_stat;
+ int rep_number=10;
+
+ for(int i=0;i<rep_number;i++)
+ {
+ net_->Forward();
+ collect_layer_stat(all_stat);
+ reset_layer_stat();
+ }
+
+ //dump stats
+ dump_all_stat(all_stat);
+
+ for(int i=0;i<all_stat.size();i++)
+ delete all_stat[i];
+
+#endif //REPEAT_TEST
+#endif //LAYER_PERF_STAT
+#endif //USE_PROFILING
+
+ /* Copy the output layer to a std::vector */
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ const float* begin = output_layer->cpu_data();
+ const float* end = begin + output_layer->channels();
+ return std::vector<float>(begin, end);
+}
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+
+#ifdef REPEAT_TEST
+void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat)
+{
+ vector<perf_stat > * p_stat;
+ perf_stat * p_time_stat;
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+
+ p_stat=new vector<perf_stat>;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+ p_stat->push_back(*p_time_stat);
+
+ }
+
+ all_stat.push_back(p_stat);
+}
+
+void Classifier::reset_layer_stat(void)
+{
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+ perf_stat * p_time_stat;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+
+ p_time_stat->count=0;
+ p_time_stat->total=0;
+ p_time_stat->used=p_time_stat->start=p_time_stat->end=0;
+ }
+}
+
+void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat)
+{
+
+ struct new_perf_stat {
+ perf_stat stat;
+ int idx;
+ };
+
+ vector<new_perf_stat > layer_stat;
+ perf_stat * p_stat;
+
+ uint64_t total_time=0;
+
+ layer_stat.resize(all_stat[0]->size());
+
+ for(int i=0;i<all_stat.size();i++)
+ {
+ for(int j=0;j<layer_stat.size();j++)
+ {
+ p_stat=&layer_stat[j].stat;
+
+ p_stat->total+=(*all_stat[i])[j].total;
+ p_stat->count+=(*all_stat[i])[j].count;
+ total_time+=(*all_stat[i])[j].total;
+ }
+ }
+
+ total_time=total_time/all_stat.size();
+
+ std::cout<<std::endl<<"----------------------------------"<<std::endl;
+ std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl;
+ std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl;
+ std::cout<<"Each layer stats: ..."<<std::endl;
+
+
+ for(int i=layer_stat.size()-1;i>=0;i--)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ layer_stat[i].idx=i;
+
+ std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ }
+
+ std::cout<<std::endl;
+
+ std::cout<<"time cost top 10 layers are: ..."<<std::endl;
+
+ std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b)
+ {
+ if(a.stat.total>b.stat.total)
+ return true;
+ else
+ return false;
+ });
+
+ uint64_t top_total_time=0;
+
+ for(int i=0; i<10; i++)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ top_total_time+=p_stat->total;
+ }
+
+ std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl;
+
+ std::cout<<std::endl;
+}
+
+#endif
+
+void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type()<<std::endl;
+
+ const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom;
+
+ for(int i=0;i<layer_param.bottom_size(); i++)
+ {
+ std::cout<<"bottom "<<layer_param.bottom(i)<<": ";
+
+ Blob<float> * p_blob=(*p_bottom_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+
+ const vector<Blob<float>*> *p_top_vec=p_layer->saved_top;
+ for(int i=0;i<layer_param.top_size(); i++)
+ {
+ std::cout<<"top "<<layer_param.top(i)<<": ";
+ Blob<float> * p_blob=(*p_top_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+}
+
+void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+ perf_stat * p_time_stat;
+
+ p_time_stat=p_layer->get_time_stat();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type();
+ std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl;
+
+
+ std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count;
+ if(p_time_stat->count)
+ {
+ std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count;
+ }
+
+ std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end;
+ std::cout<<std::endl;
+
+
+}
+
+void Classifier::dump_perf_stat(void)
+{
+ uint64_t total_net_time=0;
+
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+ std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+ dump_single_layer_io(i,layers[i].get());
+ }
+
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ perf_stat * p_time_stat;
+
+ p_time_stat=layers[i]->get_time_stat();
+
+ total_net_time+=p_time_stat->total;
+
+ }
+
+ std::cout<<"Time for each layer ... sum of all layers is : ";
+ std::cout<<total_net_time<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ dump_single_layer_perf(i,layers[i].get(),total_net_time);
+ }
+
+}
+
+#endif
+
+#endif //USE_PROFILING
+
+/* Wrap the input layer of the network in separate cv::Mat objects
+ * (one per channel). This way we save one memcpy operation and we
+ * don't need to rely on cudaMemcpy2D. The last preprocessing
+ * operation will write the separate channels directly to the input
+ * layer. */
+void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+
+ int width = input_layer->width();
+ int height = input_layer->height();
+ float* input_data = input_layer->mutable_cpu_data();
+ for (int i = 0; i < input_layer->channels(); ++i) {
+ cv::Mat channel(height, width, CV_32FC1, input_data);
+ input_channels->push_back(channel);
+ input_data += width * height;
+ }
+}
+
+void Classifier::Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels) {
+ /* Convert the input image to the input image format of the network. */
+ cv::Mat sample;
+ if (img.channels() == 3 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
+ else if (img.channels() == 1 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
+ else
+ sample = img;
+
+ cv::Mat sample_resized;
+ if (sample.size() != input_geometry_)
+ cv::resize(sample, sample_resized, input_geometry_);
+ else
+ sample_resized = sample;
+
+ cv::Mat sample_float;
+ if (num_channels_ == 3)
+ sample_resized.convertTo(sample_float, CV_32FC3);
+ else
+ sample_resized.convertTo(sample_float, CV_32FC1);
+
+ cv::Mat sample_normalized;
+ cv::subtract(sample_float, mean_, sample_normalized);
+
+ /* This operation will write the separate BGR planes directly to the
+ * input layer of the network because it is wrapped by the cv::Mat
+ * objects in input_channels. */
+ cv::split(sample_normalized, *input_channels);
+
+ CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
+ == net_->input_blobs()[0]->cpu_data())
+ << "Input channels are not wrapping the input layer of the network.";
+}
+
+int main(int argc, char** argv) {
+ if (argc != 6) {
+ std::cerr << "Usage: " << argv[0]
+ << " deploy.prototxt network.caffemodel"
+ << " mean.binaryproto labels.txt img.jpg" << std::endl;
+ return 1;
+ }
+
+ ::google::InitGoogleLogging(argv[0]);
+
+ string model_file = argv[1];
+ string trained_file = argv[2];
+ string mean_file = argv[3];
+ string label_file = argv[4];
+ Classifier classifier(model_file, trained_file, mean_file, label_file);
+
+ string file = argv[5];
+
+ std::cout << "---------- Prediction for "
+ << file << " ----------" << std::endl;
+
+ cv::Mat img = cv::imread(file, -1);
+ CHECK(!img.empty()) << "Unable to decode image " << file;
+ std::vector<Prediction> predictions = classifier.Classify(img);
+
+ /* Print the top N predictions. */
+ for (size_t i = 0; i < predictions.size(); ++i) {
+ Prediction p = predictions[i];
+ std::cout << std::fixed << std::setprecision(4) << p.second << " - \""
+ << p.first << "\"" << std::endl;
+ }
+}
+#else
+int main(int argc, char** argv) {
+ LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
+}
+#endif // USE_OPENCV
diff --git a/examples/cpp_classification/classification_profiling_gpu.cpp b/examples/cpp_classification/classification_profiling_gpu.cpp
new file mode 100644
index 00000000..3c5e04ad
--- /dev/null
+++ b/examples/cpp_classification/classification_profiling_gpu.cpp
@@ -0,0 +1,546 @@
+#include <caffe/caffe.hpp>
+#ifdef USE_OPENCV
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif // USE_OPENCV
+#include <algorithm>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifdef USE_PROFILING
+
+#include <iostream>
+
+#include <time.h>
+
+#define REPEAT_TEST
+
+/* Return the current time in milliseconds (CLOCK_MONOTONIC_COARSE). */
+unsigned long get_cur_time(void)
+{
+ struct timespec tm;
+
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &tm);
+
+ return (tm.tv_sec*1000+tm.tv_nsec/1000000);
+}
+
+#endif //USE_PROFILING
+
+#ifdef USE_OPENCV
+using namespace caffe; // NOLINT(build/namespaces)
+using std::string;
+
+/* Pair (label, confidence) representing a prediction. */
+typedef std::pair<string, float> Prediction;
+
+class Classifier {
+ public:
+ Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file);
+
+ std::vector<Prediction> Classify(const cv::Mat& img, int N = 5);
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+ void dump_perf_stat(void);
+ void dump_single_layer_io(int idx, Layer<float> * p_layer);
+ void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time);
+#ifdef REPEAT_TEST
+ void collect_layer_stat(vector<vector<perf_stat> * > & all_stat);
+ void dump_all_stat(vector <vector<perf_stat>*>& all_stat);
+ void reset_layer_stat();
+#endif
+#endif
+
+#endif //USE_PROFILING
+
+ private:
+ void SetMean(const string& mean_file);
+
+ std::vector<float> Predict(const cv::Mat& img);
+
+ void WrapInputLayer(std::vector<cv::Mat>* input_channels);
+
+ void Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels);
+
+ private:
+ shared_ptr<Net<float> > net_;
+ cv::Size input_geometry_;
+ int num_channels_;
+ cv::Mat mean_;
+ std::vector<string> labels_;
+};
+
+Classifier::Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file) {
+//#ifdef CPU_ONLY
+// Caffe::set_mode(Caffe::CPU);
+//#else
+ Caffe::set_mode(Caffe::GPU); // For the ARM GPU: the build is CPU_ONLY, so GPU mode simply selects ACL's OpenCL path
+//#endif
+
+ /* Load the network. */
+ net_.reset(new Net<float>(model_file, TEST));
+ net_->CopyTrainedLayersFrom(trained_file);
+
+ CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
+ CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";
+
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ num_channels_ = input_layer->channels();
+ CHECK(num_channels_ == 3 || num_channels_ == 1)
+ << "Input layer should have 1 or 3 channels.";
+ input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
+
+ /* Load the binaryproto mean file. */
+ SetMean(mean_file);
+
+ /* Load labels. */
+ std::ifstream labels(label_file.c_str());
+ CHECK(labels) << "Unable to open labels file " << label_file;
+ string line;
+ while (std::getline(labels, line))
+ labels_.push_back(string(line));
+
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ CHECK_EQ(labels_.size(), output_layer->channels())
+ << "Number of labels is different from the output layer dimension.";
+}
+
+static bool PairCompare(const std::pair<float, int>& lhs,
+ const std::pair<float, int>& rhs) {
+ return lhs.first > rhs.first;
+}
+
+/* Return the indices of the top N values of vector v. */
+static std::vector<int> Argmax(const std::vector<float>& v, int N) {
+ std::vector<std::pair<float, int> > pairs;
+ for (size_t i = 0; i < v.size(); ++i)
+ pairs.push_back(std::make_pair(v[i], i));
+ std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);
+
+ std::vector<int> result;
+ for (int i = 0; i < N; ++i)
+ result.push_back(pairs[i].second);
+ return result;
+}
+
+/* Return the top N predictions. */
+std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) {
+ std::vector<float> output = Predict(img);
+
+ N = std::min<int>(labels_.size(), N);
+ std::vector<int> maxN = Argmax(output, N);
+ std::vector<Prediction> predictions;
+ for (int i = 0; i < N; ++i) {
+ int idx = maxN[i];
+ predictions.push_back(std::make_pair(labels_[idx], output[idx]));
+ }
+
+ return predictions;
+}
+
+/* Load the mean file in binaryproto format. */
+void Classifier::SetMean(const string& mean_file) {
+ BlobProto blob_proto;
+ ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+
+ /* Convert from BlobProto to Blob<float> */
+ Blob<float> mean_blob;
+ mean_blob.FromProto(blob_proto);
+ CHECK_EQ(mean_blob.channels(), num_channels_)
+ << "Number of channels of mean file doesn't match input layer.";
+
+ /* The format of the mean file is planar 32-bit float BGR or grayscale. */
+ std::vector<cv::Mat> channels;
+ float* data = mean_blob.mutable_cpu_data();
+ for (int i = 0; i < num_channels_; ++i) {
+ /* Extract an individual channel. */
+ cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data);
+ channels.push_back(channel);
+ data += mean_blob.height() * mean_blob.width();
+ }
+
+ /* Merge the separate channels into a single image. */
+ cv::Mat mean;
+ cv::merge(channels, mean);
+
+ /* Compute the global mean pixel value and create a mean image
+ * filled with this value. */
+ cv::Scalar channel_mean = cv::mean(mean);
+ mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean);
+}
+
+std::vector<float> Classifier::Predict(const cv::Mat& img) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ input_layer->Reshape(1, num_channels_,
+ input_geometry_.height, input_geometry_.width);
+ /* Forward dimension change to all layers. */
+ net_->Reshape();
+
+ std::vector<cv::Mat> input_channels;
+ WrapInputLayer(&input_channels);
+
+ Preprocess(img, &input_channels);
+
+#ifdef USE_PROFILING
+ unsigned long tstart=get_cur_time();
+#endif //USE_PROFILING
+
+ net_->Forward();
+
+#ifdef USE_PROFILING
+
+ unsigned long tend=get_cur_time();
+
+ std::cout<<"used time: "<<tend-tstart<<std::endl;
+
+#ifdef LAYER_PERF_STAT
+ dump_perf_stat();
+#ifdef REPEAT_TEST
+
+ reset_layer_stat();
+
+ vector<vector<perf_stat>* > all_stat;
+ int rep_number=10;
+
+ for(int i=0;i<rep_number;i++)
+ {
+ net_->Forward();
+ collect_layer_stat(all_stat);
+ reset_layer_stat();
+ }
+
+ //dump stats
+ dump_all_stat(all_stat);
+
+ for(int i=0;i<all_stat.size();i++)
+ delete all_stat[i];
+
+#endif //REPEAT_TEST
+#endif //LAYER_PERF_STAT
+#endif //USE_PROFILING
+
+ /* Copy the output layer to a std::vector */
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ const float* begin = output_layer->cpu_data();
+ const float* end = begin + output_layer->channels();
+ return std::vector<float>(begin, end);
+}
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+
+#ifdef REPEAT_TEST
+void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat)
+{
+ vector<perf_stat > * p_stat;
+ perf_stat * p_time_stat;
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+
+ p_stat=new vector<perf_stat>;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+ p_stat->push_back(*p_time_stat);
+
+ }
+
+ all_stat.push_back(p_stat);
+}
+
+void Classifier::reset_layer_stat(void)
+{
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+ perf_stat * p_time_stat;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+
+ p_time_stat->count=0;
+ p_time_stat->total=0;
+ p_time_stat->used=p_time_stat->start=p_time_stat->end=0;
+ }
+}
+
+void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat)
+{
+
+ struct new_perf_stat {
+ perf_stat stat;
+ int idx;
+ };
+
+ vector<new_perf_stat > layer_stat;
+ perf_stat * p_stat;
+
+ uint64_t total_time=0;
+
+ layer_stat.resize(all_stat[0]->size());
+
+ for(int i=0;i<all_stat.size();i++)
+ {
+ for(int j=0;j<layer_stat.size();j++)
+ {
+ p_stat=&layer_stat[j].stat;
+
+ p_stat->total+=(*all_stat[i])[j].total;
+ p_stat->count+=(*all_stat[i])[j].count;
+ total_time+=(*all_stat[i])[j].total;
+ }
+ }
+
+ total_time=total_time/all_stat.size();
+
+ std::cout<<std::endl<<"----------------------------------"<<std::endl;
+ std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl;
+ std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl;
+ std::cout<<"Each layer stats: ..."<<std::endl;
+
+
+ for(int i=layer_stat.size()-1;i>=0;i--)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ layer_stat[i].idx=i;
+
+ std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ }
+
+ std::cout<<std::endl;
+
+ std::cout<<"time cost top 10 layers are: ..."<<std::endl;
+
+ std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b)
+ {
+ if(a.stat.total>b.stat.total)
+ return true;
+ else
+ return false;
+ });
+
+ uint64_t top_total_time=0;
+
+ for(int i=0; i<10; i++)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ top_total_time+=p_stat->total;
+ }
+
+ std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl;
+
+ std::cout<<std::endl;
+}
+
+#endif
+
+void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type()<<std::endl;
+
+ const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom;
+
+ for(int i=0;i<layer_param.bottom_size(); i++)
+ {
+ std::cout<<"bottom "<<layer_param.bottom(i)<<": ";
+
+ Blob<float> * p_blob=(*p_bottom_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+
+ const vector<Blob<float>*> *p_top_vec=p_layer->saved_top;
+ for(int i=0;i<layer_param.top_size(); i++)
+ {
+ std::cout<<"top "<<layer_param.top(i)<<": ";
+ Blob<float> * p_blob=(*p_top_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+}
+
+void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+ perf_stat * p_time_stat;
+
+ p_time_stat=p_layer->get_time_stat();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type();
+ std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl;
+
+
+ std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count;
+ if(p_time_stat->count)
+ {
+ std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count;
+ }
+
+ std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end;
+ std::cout<<std::endl;
+
+
+}
+
+void Classifier::dump_perf_stat(void)
+{
+ uint64_t total_net_time=0;
+
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+ std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+ dump_single_layer_io(i,layers[i].get());
+ }
+
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ perf_stat * p_time_stat;
+
+ p_time_stat=layers[i]->get_time_stat();
+
+ total_net_time+=p_time_stat->total;
+
+ }
+
+ std::cout<<"Time for each layer ... sum of all layers is : ";
+ std::cout<<total_net_time<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ dump_single_layer_perf(i,layers[i].get(),total_net_time);
+ }
+
+}
+
+#endif
+
+#endif //USE_PROFILING
+
+/* Wrap the input layer of the network in separate cv::Mat objects
+ * (one per channel). This way we save one memcpy operation and we
+ * don't need to rely on cudaMemcpy2D. The last preprocessing
+ * operation will write the separate channels directly to the input
+ * layer. */
+void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+
+ int width = input_layer->width();
+ int height = input_layer->height();
+ float* input_data = input_layer->mutable_cpu_data();
+ for (int i = 0; i < input_layer->channels(); ++i) {
+ cv::Mat channel(height, width, CV_32FC1, input_data);
+ input_channels->push_back(channel);
+ input_data += width * height;
+ }
+}
+
+void Classifier::Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels) {
+ /* Convert the input image to the input image format of the network. */
+ cv::Mat sample;
+ if (img.channels() == 3 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
+ else if (img.channels() == 1 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
+ else
+ sample = img;
+
+ cv::Mat sample_resized;
+ if (sample.size() != input_geometry_)
+ cv::resize(sample, sample_resized, input_geometry_);
+ else
+ sample_resized = sample;
+
+ cv::Mat sample_float;
+ if (num_channels_ == 3)
+ sample_resized.convertTo(sample_float, CV_32FC3);
+ else
+ sample_resized.convertTo(sample_float, CV_32FC1);
+
+ cv::Mat sample_normalized;
+ cv::subtract(sample_float, mean_, sample_normalized);
+
+ /* This operation will write the separate BGR planes directly to the
+ * input layer of the network because it is wrapped by the cv::Mat
+ * objects in input_channels. */
+ cv::split(sample_normalized, *input_channels);
+
+ CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
+ == net_->input_blobs()[0]->cpu_data())
+ << "Input channels are not wrapping the input layer of the network.";
+}
+
+int main(int argc, char** argv) {
+ if (argc != 6) {
+ std::cerr << "Usage: " << argv[0]
+ << " deploy.prototxt network.caffemodel"
+ << " mean.binaryproto labels.txt img.jpg" << std::endl;
+ return 1;
+ }
+
+ ::google::InitGoogleLogging(argv[0]);
+
+ string model_file = argv[1];
+ string trained_file = argv[2];
+ string mean_file = argv[3];
+ string label_file = argv[4];
+ Classifier classifier(model_file, trained_file, mean_file, label_file);
+
+ string file = argv[5];
+
+ std::cout << "---------- Prediction for "
+ << file << " ----------" << std::endl;
+
+ cv::Mat img = cv::imread(file, -1);
+ CHECK(!img.empty()) << "Unable to decode image " << file;
+ std::vector<Prediction> predictions = classifier.Classify(img);
+
+ /* Print the top N predictions. */
+ for (size_t i = 0; i < predictions.size(); ++i) {
+ Prediction p = predictions[i];
+ std::cout << std::fixed << std::setprecision(4) << p.second << " - \""
+ << p.first << "\"" << std::endl;
+ }
+}
+#else
+int main(int argc, char** argv) {
+ LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
+}
+#endif // USE_OPENCV
diff --git a/include/caffe/acl_layer.hpp b/include/caffe/acl_layer.hpp
new file mode 100644
index 00000000..db9fee5c
--- /dev/null
+++ b/include/caffe/acl_layer.hpp
@@ -0,0 +1,246 @@
+#ifndef CAFFE_ACL_LAYER_HPP_
+#define CAFFE_ACL_LAYER_HPP_
+
+#ifdef USE_ACL
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+using namespace arm_compute;
+#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
+#define FLAGS_ENABLE_ACL_BNLL 0x00000002
+#define FLAGS_ENABLE_ACL_CONV 0x00000004
+#define FLAGS_ENABLE_ACL_FC 0x00000008
+#define FLAGS_ENABLE_ACL_LRN 0x00000010
+#define FLAGS_ENABLE_ACL_POOLING 0x00000020
+#define FLAGS_ENABLE_ACL_RELU 0x00000040
+#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
+#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
+#define FLAGS_ENABLE_ACL_TANH 0x00000200
+extern unsigned int bypass_acl_class_layer;
+#endif
+#ifdef USE_PROFILING
+#include "layer.hpp"
+
+#define MASK_LOG_APP_TIME 0x00000001
+#define MASK_LOG_ALLOCATE 0x00000002
+#define MASK_LOG_RUN 0x00000004
+#define MASK_LOG_CONFIG 0x00000008
+#define MASK_LOG_COPY 0x00000010
+#define MASK_LOG_ABSVAL 0x00000020
+#define MASK_LOG_BNLL 0x00000040
+#define MASK_LOG_CONV 0x00000080
+#define MASK_LOG_FC 0x00000100
+#define MASK_LOG_LRN 0x00000200
+#define MASK_LOG_POOLING 0x00000400
+#define MASK_LOG_RELU 0x00000800
+#define MASK_LOG_SIGMOID 0x00001000
+#define MASK_LOG_SOFTMAX 0x00002000
+#define MASK_LOG_TANH 0x00004000
+#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t"
+#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t"
+#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
+#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
+#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
+#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
+#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
+#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
+#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
+#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
+#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+extern unsigned int acl_log_flags;
+#endif //USE_PROFILING
+namespace caffe {
+#ifdef USE_ACL
+enum TensorType{
+ tensor_input,
+ tensor_output,
+ tensor_weights,
+ tensor_biases,
+};
+template <typename ACLTensor>
+class BaseTensor:public ACLTensor{
+public:
+ BaseTensor(bool share)
+ :share_(share),type_(tensor_input),allocate_(false){
+ }
+ virtual void bindmem(void *mem,bool share){
+ mem_=mem;
+ share_=share;
+ }
+ virtual void settensortype(TensorType type){
+ type_=type;
+ };
+ virtual void map(bool blocking = true){}
+ virtual void unmap(){}
+ virtual void commit();
+ int tensor_copy(void * mem, bool toTensor=true);
+protected:
+ void* mem_;
+ bool share_;
+ TensorType type_;
+ bool allocate_;
+};
+class GPUTensor:public BaseTensor<CLTensor>{
+public:
+ explicit GPUTensor(bool share)
+ :BaseTensor(share){}
+ virtual void map(bool blocking = true){
+ if (!allocate_){
+ CLTensor::allocator()->allocate();
+ allocate_=true;
+ }
+ CLTensor::map(blocking);
+ }
+ virtual void unmap(){
+ CLTensor::unmap();
+ }
+};
+class CPUTensor:public BaseTensor<Tensor>{
+public:
+ explicit CPUTensor(bool share)
+ :BaseTensor(share){}
+ virtual void map(bool blocking = true){
+ if (!allocate_){
+ Tensor::allocator()->allocate();
+ allocate_=true;
+ }
+ }
+ virtual void unmap(){
+ }
+};
+template <typename ACLLayer,typename ACLTensor>
+class ACLXPUBaseLayer{
+public:
+ virtual void commit(){
+ if (input) {
+ input->settensortype(tensor_input);
+ input->commit();
+ }
+ if (output){
+ output->settensortype(tensor_output);
+ output->commit();
+ }
+ if (weights){
+ weights->settensortype(tensor_weights);
+ weights->commit();
+ }
+ if (biases){
+ biases->settensortype(tensor_biases);
+ biases->commit();
+ }
+ }
+ virtual void run(bool gpu){
+ commit();
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_RUN_INFO);
+#endif //USE_PROFILING
+ layer->run();
+ if (gpu) {
+ // Make sure all the OpenCL jobs are done executing:
+ CLScheduler::get().sync();
+ }
+ }
+ virtual bool reshape(TensorShape &shape,TensorType type);
+ explicit ACLXPUBaseLayer(){
+ layer=nullptr;
+ input=nullptr;
+ output=nullptr;
+ weights=nullptr;
+ biases=nullptr;
+#ifdef USE_CONV_CACHE
+ for(int i = 0; i < 16; ++i){
+ cache.layer[i] = nullptr;
+ cache.input[i] = nullptr;
+ cache.output[i] = nullptr;
+ cache.weights[i] = nullptr;
+ cache.biases[i] = nullptr;
+ }
+#endif //USE_CONV_CACHE
+ }
+ virtual void freelayer(){
+#ifndef USE_CONV_CACHE
+ if (layer) delete layer;
+ if (input) delete input;
+ if (output) delete output;
+ if (weights) delete weights;
+ if (biases) delete biases;
+#endif //USE_CONV_CACHE
+ layer=nullptr;
+ input=nullptr;
+ output=nullptr;
+ weights=nullptr;
+ biases=nullptr;
+ }
+ virtual ~ACLXPUBaseLayer(){
+ freelayer();
+ }
+ ACLLayer *layer;
+ ACLTensor *input;
+ ACLTensor *output;
+ ACLTensor *weights;
+ ACLTensor *biases;
+#ifdef USE_CONV_CACHE
+ struct{
+ ACLLayer *layer[16];
+ ACLTensor *input[16];
+ ACLTensor *output[16];
+ ACLTensor *weights[16];
+ ACLTensor *biases[16];
+ }cache;
+#endif //USE_CONV_CACHE
+};
+template <typename GPULayer, typename CPULayer>
+class ACLBaseLayer {
+public:
+ explicit ACLBaseLayer();
+ virtual void gpu_run();
+ virtual void cpu_run();
+ virtual ~ACLBaseLayer();
+ virtual GPULayer * new_gpulayer();
+ virtual CPULayer * new_cpulayer();
+ ACLXPUBaseLayer<GPULayer,GPUTensor>& gpu(){
+ return gpu_;
+ }
+ ACLXPUBaseLayer<CPULayer,CPUTensor>& cpu(){
+ return cpu_;
+ }
+ bool checkreshape(TensorShape shape,bool gpu=false, TensorType type=tensor_input);
+ template <typename ACLTensor> bool tensor_mem(ACLTensor *tensor,void *mem,bool share=false);
+ template <typename ACLTensor> bool tensor_mem(void *mem,ACLTensor *tensor,bool share=false);
+ template <typename ACLTensor> ACLTensor * new_tensor(TensorShape shape,void *mem=nullptr,bool share=false);
+protected:
+ ACLXPUBaseLayer<GPULayer,GPUTensor> gpu_;
+ ACLXPUBaseLayer<CPULayer,CPUTensor> cpu_;
+ bool init_layer_;
+ bool force_bypass_acl_path_;
+
+};
+
+#endif
+}
+#define INSTANTIATE_ACLBASECLASS(GPULayer,CPULayer) \
+ template class ACLBaseLayer<GPULayer,CPULayer>;
+
+#define INSTANTIATE_ACLBASE_FUNCTION(GPULayer,CPULayer,ACLTensor) \
+ template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem<ACLTensor>(ACLTensor *tensor,void *mem,bool share); \
+ template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share); \
+ template ACLTensor * ACLBaseLayer<GPULayer,CPULayer>::new_tensor(TensorShape shape,void *mem,bool share); \
+
+
+#endif
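
The header above pairs every ACL function with a GPU instance (OpenCL, CLTensor) and a CPU instance (NEON, Tensor), and exposes a per-layer-class bypass bitmask. Below is a minimal sketch of how a derived layer might tie these pieces together, assuming a set bit in bypass_acl_class_layer means "skip ACL for this layer class and fall back to the stock Caffe implementation". The class name is hypothetical; the real per-layer sources (e.g. src/caffe/layers/acl_relu_layer.cpp) are part of this commit but not reproduced in this excerpt.

// Hypothetical sketch, assumed to live inside namespace caffe with the
// usual headers; MyACLReLULayer is an illustrative name, not a real class.
template <typename Dtype>
void MyACLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
                                        const vector<Blob<Dtype>*>& top) {
  // Fall back to the stock Caffe layer when the bypass bit is set.
  if (bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU) {
    ReLULayer<Dtype>::Forward_cpu(bottom, top);
    return;
  }
  // Otherwise run the ACL function; arm_gpu_mode() (see common.hpp below)
  // selects between the two instances held by ACLBaseLayer.
  if (Caffe::arm_gpu_mode()) {
    this->gpu_run();   // OpenCL path: ACLXPUBaseLayer<GPULayer, GPUTensor>
  } else {
    this->cpu_run();   // NEON path:   ACLXPUBaseLayer<CPULayer, CPUTensor>
  }
}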
diff --git a/include/caffe/caffe.hpp b/include/caffe/caffe.hpp
index 06882096..6ea2bb5d 100644
--- a/include/caffe/caffe.hpp
+++ b/include/caffe/caffe.hpp
@@ -4,6 +4,12 @@
#ifndef CAFFE_CAFFE_HPP_
#define CAFFE_CAFFE_HPP_
+#ifdef USE_ACL
+#ifndef CPU_ONLY
+#define CPU_ONLY
+#endif
+#endif
+
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 4904d1d8..34f2b8c4 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -145,7 +145,13 @@ class Caffe {
// into the program since that may cause allocation of pinned memory being
// freed in a non-pinned way, which may cause problems - I haven't verified
// it personally but better to note it here in the header file.
- inline static void set_mode(Brew mode) { Get().mode_ = mode; }
+#ifdef USE_ACL
+ inline static bool arm_gpu_mode() {return Get().use_mali_gpu_;}
+ inline static void set_mode(Brew mode) { Get().mode_ = CPU; set_arm_gpu_mode(mode==GPU);}
+ inline static void set_arm_gpu_mode(bool use_mali_gpu) { Get().use_mali_gpu_ = use_mali_gpu;}
+#else
+ inline static void set_mode(Brew mode) { Get().mode_ = mode;}
+#endif
// Sets the random seed of both boost and curand
static void set_random_seed(const unsigned int seed);
// Sets the device. Since we have cublas and curand stuff, set device also
@@ -175,7 +181,9 @@ class Caffe {
shared_ptr<RNG> random_generator_;
Brew mode_;
-
+#ifdef USE_ACL
+ bool use_mali_gpu_;
+#endif
// Parallel training
int solver_count_;
int solver_rank_;
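
With USE_ACL defined, the patched set_mode() above never puts Caffe into real (CUDA) GPU mode: it pins mode_ to CPU and records the GPU request in use_mali_gpu_. A small self-contained illustration of that behavior, using the stock Caffe::mode() accessor:

#include <cassert>
#include "caffe/common.hpp"

int main() {
  // Built with USE_ACL: asking for GPU mode keeps the CUDA-free CPU
  // code paths, but flags the ARM Mali GPU path for the ACL layers.
  caffe::Caffe::set_mode(caffe::Caffe::GPU);
  assert(caffe::Caffe::mode() == caffe::Caffe::CPU);
  assert(caffe::Caffe::arm_gpu_mode());
  return 0;
}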
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 30dbfd53..49b1e695 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -10,7 +10,11 @@
#include "caffe/layer_factory.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/math_functions.hpp"
-
+#ifdef USE_PROFILING
+#include <sys/time.h>
+#define NANO_SEC_CONV 1000000 /* microseconds per second (despite the name); converts timeval to usec */
+extern unsigned int acl_log_flags;
+#endif //USE_PROFILING
/**
Forward declare boost::thread instead of including boost/thread.hpp
to avoid a boost/NVCC issues (#1009, #1010) on OSX.
@@ -18,6 +22,64 @@
namespace boost { class mutex; }
namespace caffe {
+#ifdef USE_PROFILING
+class logtime_util
+{
+ public:
+ logtime_util(int mask_, const char* information_){
+ mask = mask_;
+ if(acl_log_flags & mask){
+ strncpy(information, information_, 255);
+ information[255] = '\0'; /* strncpy does not guarantee termination */
+ gettimeofday(&tv[0], NULL);
+ }
+ }
+ ~logtime_util(){
+ if(acl_log_flags & mask){
+ long time[2];
+ gettimeofday(&tv[1], NULL);
+ time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
+ time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
+ printf("%s %.6lf\n", information, (((double)time[1] - time[0]) / NANO_SEC_CONV));
+ }
+ }
+ void log_time(bool start)
+ {
+ if(acl_log_flags & mask){
+ if (start){
+ gettimeofday(&tv[0], NULL);
+ }
+ else{
+ long time[2];
+ gettimeofday(&tv[1], NULL);
+ time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
+ time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
+ printf("%s %.6lf\n", information, (((double)time[1] - time[0]) / NANO_SEC_CONV));
+ }
+ }
+ }
+private:
+ struct timeval tv[2];
+ int mask;
+ char information[256];
+};
+
+#ifdef LAYER_PERF_STAT
+
+struct perf_stat {
+  uint64_t total;
+  uint32_t start;
+  uint32_t end;
+  uint32_t used;
+  uint32_t count;
+
+  perf_stat() : total(0), start(0), end(0), used(0), count(0) {}
+};
+
+
+#endif
+#endif //USE_PROFILING
/**
* @brief An interface for the units of computation which can be composed into a
@@ -123,8 +185,13 @@ class Layer {
*
* Your layer should implement Forward_cpu and (optionally) Forward_gpu.
*/
+#ifdef USE_PROFILING
+ Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+#else
inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+#endif //USE_PROFILING
/**
* @brief Given the top blob error gradients, compute the bottom blob error
@@ -290,7 +357,19 @@ class Layer {
}
param_propagate_down_[param_id] = value;
}
+
+#ifdef USE_PROFILING
+#ifdef LAYER_PERF_STAT
+
+ const vector<Blob<Dtype>*> * saved_top;
+ const vector<Blob<Dtype>*> * saved_bottom;
+ perf_stat * get_time_stat(void) { return &time_stat_;}
+ perf_stat * get_pmu_stat(int index) { return &pmu_stat_[index];}
+
+#endif
+
+#endif //USE_PROFILING
protected:
/** The protobuf that stores the layer parameters */
@@ -404,8 +483,17 @@ class Layer {
private:
DISABLE_COPY_AND_ASSIGN(Layer);
+
+#ifdef USE_PROFILING
+#ifdef LAYER_PERF_STAT
+ perf_stat time_stat_;
+ perf_stat pmu_stat_[16];
+#endif
+#endif //USE_PROFILING
}; // class Layer
+
+#ifndef LAYER_PERF_STAT
// Forward and backward wrappers. You should implement the cpu and
// gpu specific implementations instead, and should not change these
// functions.
@@ -445,6 +533,8 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
return loss;
}
+#endif
+
template <typename Dtype>
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
diff --git a/include/caffe/layers/acl_absval_layer.hpp b/include/caffe/layers/acl_absval_layer.hpp
new file mode 100644
index 00000000..c1655404
--- /dev/null
+++ b/include/caffe/layers/acl_absval_layer.hpp
@@ -0,0 +1,57 @@
+#ifndef CAFFE_ACL_ABSVAL_LAYER_HPP_
+#define CAFFE_ACL_ABSVAL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/absval_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#include "caffe/layers/acl_base_activation_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL acceleration of AbsValLayer.
+ * Fallback to AbsValLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLAbsValLayer : public ACLBaseActivationLayer<Dtype>,public AbsValLayer<Dtype> {
+ public:
+ explicit ACLAbsValLayer(const LayerParameter& param)
+ : ACLBaseActivationLayer<Dtype>(param),AbsValLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLAbsValLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_ABSVAL_LAYER_HPP_
diff --git a/include/caffe/layers/acl_base_activation_layer.hpp b/include/caffe/layers/acl_base_activation_layer.hpp
new file mode 100644
index 00000000..e2abdafa
--- /dev/null
+++ b/include/caffe/layers/acl_base_activation_layer.hpp
@@ -0,0 +1,56 @@
+#ifndef CAFFE_ACL_BASE_ACTIVATION_LAYER_HPP_
+#define CAFFE_ACL_BASE_ACTIVATION_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#endif
+
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief Base class shared by the ACL-accelerated activation layers.
+ * Concrete layers fall back to their plain Caffe versions for corner cases.
+ */
+template <typename Dtype>
+class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivationLayer> {
+ public:
+ explicit ACLBaseActivationLayer(const LayerParameter& param)
+ {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLBaseActivationLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type=ActivationLayerInfo::ActivationFunction::RELU);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_BASE_ACTIVATION_LAYER_HPP_
diff --git a/include/caffe/layers/acl_bnll_layer.hpp b/include/caffe/layers/acl_bnll_layer.hpp
new file mode 100644
index 00000000..ea2f8a16
--- /dev/null
+++ b/include/caffe/layers/acl_bnll_layer.hpp
@@ -0,0 +1,57 @@
+#ifndef CAFFE_ACL_BNLL_LAYER_HPP_
+#define CAFFE_ACL_BNLL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/bnll_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#include "caffe/layers/acl_base_activation_layer.hpp"
+#endif
+
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL acceleration of BNLLLayer.
+ * Fallback to BNLLLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLBNLLLayer : public ACLBaseActivationLayer<Dtype>,public BNLLLayer<Dtype> {
+ public:
+ explicit ACLBNLLLayer(const LayerParameter& param)
+ : ACLBaseActivationLayer<Dtype>(param),BNLLLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLBNLLLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_BNLL_LAYER_HPP_
diff --git a/include/caffe/layers/acl_conv_layer.hpp b/include/caffe/layers/acl_conv_layer.hpp
new file mode 100644
index 00000000..b4a75848
--- /dev/null
+++ b/include/caffe/layers/acl_conv_layer.hpp
@@ -0,0 +1,56 @@
+#ifndef CAFFE_ACL_CONV_LAYER_HPP_
+#define CAFFE_ACL_CONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/conv_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL implementation of ConvolutionLayer.
+ * Fallback to ConvolutionLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLConvolutionLayer : public ACLBaseLayer<CLConvolutionLayer,NEConvolutionLayer>,public ConvolutionLayer<Dtype> {
+ public:
+ explicit ACLConvolutionLayer(const LayerParameter& param)
+ : ConvolutionLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLConvolutionLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_CONV_LAYER_HPP_
diff --git a/include/caffe/layers/acl_inner_product_layer.hpp b/include/caffe/layers/acl_inner_product_layer.hpp
new file mode 100644
index 00000000..f42becb0
--- /dev/null
+++ b/include/caffe/layers/acl_inner_product_layer.hpp
@@ -0,0 +1,54 @@
+#ifndef CAFFE_ACL_INNER_PRODUCT_LAYER_HPP_
+#define CAFFE_ACL_INNER_PRODUCT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/inner_product_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL acceleration of InnerProductLayer.
+ * Fallback to InnerProductLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyConnectedLayer>,public InnerProductLayer<Dtype> {
+ public:
+ explicit ACLInnerProductLayer(const LayerParameter& param)
+ : InnerProductLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLInnerProductLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_INNER_PRODUCT_LAYER_HPP_
diff --git a/include/caffe/layers/acl_lrn_layer.hpp b/include/caffe/layers/acl_lrn_layer.hpp
new file mode 100644
index 00000000..6fd9fbc8
--- /dev/null
+++ b/include/caffe/layers/acl_lrn_layer.hpp
@@ -0,0 +1,54 @@
+#ifndef CAFFE_ACL_LRN_LAYER_HPP_
+#define CAFFE_ACL_LRN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/lrn_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL implementation of LRNLayer.
+ * Fallback to LRNLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLayer>,public LRNLayer<Dtype> {
+ public:
+ explicit ACLLRNLayer(const LayerParameter& param)
+ : LRNLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLLRNLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_LRN_LAYER_HPP_
diff --git a/include/caffe/layers/acl_pooling_layer.hpp b/include/caffe/layers/acl_pooling_layer.hpp
new file mode 100644
index 00000000..acca35cf
--- /dev/null
+++ b/include/caffe/layers/acl_pooling_layer.hpp
@@ -0,0 +1,54 @@
+#ifndef CAFFE_ACL_POOLING_LAYER_HPP_
+#define CAFFE_ACL_POOLING_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/pooling_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL implementation of PoolingLayer.
+ * Fallback to PoolingLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,public PoolingLayer<Dtype> {
+ public:
+ explicit ACLPoolingLayer(const LayerParameter& param)
+ : PoolingLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLPoolingLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_POOLING_LAYER_HPP_
diff --git a/include/caffe/layers/acl_relu_layer.hpp b/include/caffe/layers/acl_relu_layer.hpp
new file mode 100644
index 00000000..041dbecf
--- /dev/null
+++ b/include/caffe/layers/acl_relu_layer.hpp
@@ -0,0 +1,56 @@
+#ifndef CAFFE_ACL_RELU_LAYER_HPP_
+#define CAFFE_ACL_RELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/relu_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#include "caffe/layers/acl_base_activation_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL acceleration of ReLULayer.
+ * Fallback to ReLULayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLReLULayer : public ACLBaseActivationLayer<Dtype>,public ReLULayer<Dtype> {
+ public:
+ explicit ACLReLULayer(const LayerParameter& param)
+ : ACLBaseActivationLayer<Dtype>(param), ReLULayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLReLULayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_RELU_LAYER_HPP_
diff --git a/include/caffe/layers/acl_sigmoid_layer.hpp b/include/caffe/layers/acl_sigmoid_layer.hpp
new file mode 100644
index 00000000..8638f73e
--- /dev/null
+++ b/include/caffe/layers/acl_sigmoid_layer.hpp
@@ -0,0 +1,55 @@
+#ifndef CAFFE_ACL_SIGMOID_LAYER_HPP_
+#define CAFFE_ACL_SIGMOID_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/sigmoid_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#include "caffe/layers/acl_base_activation_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL acceleration of SigmoidLayer.
+ */
+template <typename Dtype>
+class ACLSigmoidLayer : public ACLBaseActivationLayer<Dtype>,public SigmoidLayer<Dtype> {
+ public:
+ explicit ACLSigmoidLayer(const LayerParameter& param)
+ : ACLBaseActivationLayer<Dtype>(param),SigmoidLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLSigmoidLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_SIGMOID_LAYER_HPP_
diff --git a/include/caffe/layers/acl_softmax_layer.hpp b/include/caffe/layers/acl_softmax_layer.hpp
new file mode 100644
index 00000000..9e450f5d
--- /dev/null
+++ b/include/caffe/layers/acl_softmax_layer.hpp
@@ -0,0 +1,54 @@
+#ifndef CAFFE_ACL_SOFTMAX_LAYER_HPP_
+#define CAFFE_ACL_SOFTMAX_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/softmax_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL implementation of SoftmaxLayer.
+ * Fallback to SoftmaxLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,public SoftmaxLayer<Dtype> {
+ public:
+ explicit ACLSoftmaxLayer(const LayerParameter& param)
+ : SoftmaxLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLSoftmaxLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_SOFTMAX_LAYER_HPP_
diff --git a/include/caffe/layers/acl_tanh_layer.hpp b/include/caffe/layers/acl_tanh_layer.hpp
new file mode 100644
index 00000000..5a74ce5a
--- /dev/null
+++ b/include/caffe/layers/acl_tanh_layer.hpp
@@ -0,0 +1,56 @@
+#ifndef CAFFE_ACL_TANH_LAYER_HPP_
+#define CAFFE_ACL_TANH_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/tanh_layer.hpp"
+
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+#include "caffe/layers/acl_base_activation_layer.hpp"
+#endif
+
+namespace caffe {
+
+#ifdef USE_ACL
+/**
+ * @brief ACL acceleration of TanHLayer.
+ * Fallback to TanHLayer for some corner cases.
+ */
+template <typename Dtype>
+class ACLTanHLayer : public ACLBaseActivationLayer<Dtype>,public TanHLayer<Dtype> {
+ public:
+ explicit ACLTanHLayer(const LayerParameter& param)
+ : ACLBaseActivationLayer<Dtype>(param),TanHLayer<Dtype>(param) {}
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLTanHLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
+ }
+ virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+};
+#endif
+
+} // namespace caffe
+
+#endif // CAFFE_ACL_TANH_LAYER_HPP_
diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp
index e3fe4fe2..e7ab4b70 100644
--- a/include/caffe/util/device_alternate.hpp
+++ b/include/caffe/util/device_alternate.hpp
@@ -7,7 +7,11 @@
// Stub out GPU calls as unavailable.
+#ifdef USE_ACL
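+// ACL reuses the *_gpu entry points for the Mali path, so the CPU-only stub
+// must be a silent no-op rather than a fatal error.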
+#define NO_GPU
+#else
#define NO_GPU LOG(FATAL) << "Cannot use GPU in CPU-only Caffe: check mode."
+#endif
#define STUB_GPU(classname) \
template <typename Dtype> \
diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp
index 71549c1c..dbd8bb6c 100644
--- a/include/caffe/util/hdf5.hpp
+++ b/include/caffe/util/hdf5.hpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#ifndef CAFFE_UTIL_HDF5_H_
#define CAFFE_UTIL_HDF5_H_
@@ -37,3 +38,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx);
} // namespace caffe
#endif // CAFFE_UTIL_HDF5_H_
+#endif // USE_HDF5
diff --git a/src/caffe/acl_layer.cpp b/src/caffe/acl_layer.cpp
new file mode 100644
index 00000000..4c20037a
--- /dev/null
+++ b/src/caffe/acl_layer.cpp
@@ -0,0 +1,274 @@
+#ifdef USE_ACL
+#include "caffe/acl_layer.hpp"
+
+unsigned int bypass_acl_class_layer = (0 | \
+ /*0xffffffff |*/ \
+ /*FLAGS_ENABLE_ACL_FC |*/ \
+ /*FLAGS_ENABLE_ACL_LRN |*/ \
+ 0 );
+
+#ifdef USE_PROFILING
+
+#include "arm_neon.h"
+
+unsigned int acl_log_flags = (0 | \
+ MASK_LOG_APP_TIME | \
+ /*MASK_LOG_ALLOCATE | */\
+ /*MASK_LOG_ALLOCATE | */\
+ /*MASK_LOG_RUN | */\
+ /*MASK_LOG_CONFIG | */\
+ /*MASK_LOG_COPY | */\
+ MASK_LOG_ABSVAL | \
+ MASK_LOG_BNLL | \
+ MASK_LOG_CONV | \
+ MASK_LOG_FC | \
+ MASK_LOG_LRN | \
+ MASK_LOG_POOLING | \
+ MASK_LOG_RELU | \
+ MASK_LOG_SIGMOID | \
+ MASK_LOG_SOFTMAX | \
+ MASK_LOG_TANH | \
+ 0);
+#include <stdio.h> /* printf */
+#include <stdlib.h> /* getenv */
+#endif //USE_PROFILING
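+
+// Both masks can be overridden at run time without rebuilding: the values are
+// parsed with sscanf("%i"), so decimal, octal and hex all work. For example
+//   BYPASSACL=0xffffffff ./caffe time -model deploy.prototxt
+// forces every layer class back onto the plain Caffe path (deploy.prototxt is
+// a placeholder; bit meanings follow the FLAGS_ENABLE_ACL_* and MASK_LOG_*
+// definitions in acl_layer.hpp).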
+
+namespace caffe {
+template <typename GPULayer, typename CPULayer>
+ACLBaseLayer<GPULayer,CPULayer>::ACLBaseLayer()
+ :init_layer_(true),force_bypass_acl_path_(false){
+ const char* pBypassACL;
+ pBypassACL = getenv ("BYPASSACL");
+ if (pBypassACL){
+ unsigned int bacl;
+ sscanf(pBypassACL,"%i", &bacl);
+ if(bacl != bypass_acl_class_layer){
+ bypass_acl_class_layer = bacl;
+ printf("BYPASSACL<%s>\n", pBypassACL);
+ printf("BYPASSACL: %x\n", bypass_acl_class_layer);
+ }
+ }
+#ifdef USE_PROFILING
+ const char* pLogACL;
+ pLogACL = getenv("LOGACL");
+ if (pLogACL){
+ unsigned int alf;
+ sscanf(pLogACL,"%i", &alf);
+ if (alf != acl_log_flags){
+ acl_log_flags = alf;
+ printf("LOGACL<%s>\n", pLogACL);
+ printf("LOGACL: %x\n", acl_log_flags);
+ }
+ }
+#endif //USE_PROFILING
+}
+template <typename GPULayer, typename CPULayer>
+void ACLBaseLayer<GPULayer,CPULayer>::gpu_run() {
+ gpu_.run(true);
+}
+template <typename GPULayer, typename CPULayer>
+void ACLBaseLayer<GPULayer,CPULayer>::cpu_run() {
+ cpu_.run(false);
+}
+
+template <typename GPULayer, typename CPULayer>
+ACLBaseLayer<GPULayer,CPULayer>::~ACLBaseLayer(){
+}
+template <typename GPULayer, typename CPULayer>
+template <typename ACLTensor> ACLTensor * ACLBaseLayer<GPULayer,CPULayer>::new_tensor(TensorShape shape,void *mem,bool share)
+{
+ ACLTensor * tensor=new ACLTensor(share);
+#if 1 //F32
+ tensor->allocator()->init(TensorInfo(shape, Format::F32));
+#else //F16
+ tensor->allocator()->init(TensorInfo(shape, Format::F16));
+#endif
+ tensor->bindmem(mem,share);
+ return tensor;
+}
+
+template <typename ACLTensor>
+void BaseTensor<ACLTensor>::commit(){
+ if (!share_&&mem_) {
+ if (!allocate_){
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_ALLOCATE_INFO);
+#endif //USE_PROFILING
+ ACLTensor::allocator()->allocate();
+ allocate_=true;
+ }
+ if (type_!= tensor_output) {
+ tensor_copy(mem_);
+ }
+ mem_=nullptr;
+ }
+}
+
+template <typename ACLTensor>
+int BaseTensor<ACLTensor>::tensor_copy(void * mem,bool toTensor)
+{
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_COPY_INFO);
+#endif //USE_PROFILING
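+  // Row-wise copy between the flat NCHW Caffe buffer `mem` and the (possibly
+  // padded) ACL tensor: iterate row by row (DimY and up) and memcpy one row
+  // of `width` elements per step, in the direction selected by toTensor.
+  // The #if 0 blocks sketch a disabled F16<->F32 conversion path.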
+ arm_compute::Window window;
+ ACLTensor* tensor=this;
+ window.use_tensor_dimensions(tensor->info(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element)
+ int width = tensor->info()->tensor_shape()[0]; //->dimension(0); //window.x().end() - window.x().start(); // + 1;
+ int height = tensor->info()->tensor_shape()[1]; //->dimension(1); //window.y().end() - window.y().start(); // + 1;
+  int depth = tensor->info()->tensor_shape()[2];
+ map();
+ // Create an iterator:
+ arm_compute::Iterator it(tensor, window);
+ // Except it works for an arbitrary number of dimensions
+ if (toTensor) { //mem->tensor
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+#if 0 //F16
+ if (tensor->info()->element_size() ==2)
+ {
+ for(int i = 0; i < width; i+= 4){
+          auto pa = (float32x4_t*)((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4);
+ *(float16x4_t*)(((char*)it.ptr()) + i*2) = vcvt_f16_f32(*pa);
+ }
+ }
+ else{
+#endif
+        memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size());
+#if 0 //F16
+ }
+#endif
+ },
+ it);
+ }else{ //tensor-->mem
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+#if 0 //F16
+ if (tensor->info()->element_size() ==2)
+ {
+ for(int i = 0; i < width; i+= 4){
+          auto pa = (float32x4_t*)(((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4));
+ *pa = vcvt_f32_f16(*(float16x4_t*)(((char*)it.ptr()) + i*2));
+ }
+ }
+ else{
+#endif
+        memcpy(((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size());
+#if 0 //F16
+ }
+#endif
+ },
+ it);
+ }
+ unmap();
+
+ return 0;
+}
+
+template <typename GPULayer, typename CPULayer>
+template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share)
+{
+ tensor->bindmem(mem,share);
+ return true;
+}
+
+template <typename GPULayer, typename CPULayer>
+template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share)
+{
+ if (mem==tensor->buffer()) return true;
+ if (!share) {
+ tensor->tensor_copy(mem,false);
+ }
+ return true;
+}
+
+
+template <typename GPULayer, typename CPULayer>
+bool ACLBaseLayer<GPULayer,CPULayer>::checkreshape(TensorShape shape,bool gpu, TensorType type)
+{
+ if (gpu) {
+ init_layer_ = gpu_.reshape(shape,type);
+ }else{
+ init_layer_ = cpu_.reshape(shape,type);
+ }
+ return init_layer_;
+}
+
+template <typename GPULayer, typename CPULayer>
+GPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_gpulayer(){
+ gpu_.layer= new GPULayer;
+ return gpu_.layer;
+}
+template <typename GPULayer, typename CPULayer>
+CPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_cpulayer(){
+ cpu_.layer= new CPULayer;
+ return cpu_.layer;
+}
+template <typename ACLLayer,typename ACLTensor>
+bool ACLXPUBaseLayer<ACLLayer,ACLTensor>::reshape(TensorShape &shape,TensorType type)
+{
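+  // Returns true when the ACL layer must be (re)configured for the new shape;
+  // returns false when the already-configured layer can be reused as-is.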
+ TensorShape _shape;
+ if (!layer) return true;
+#ifdef USE_CONV_CACHE
+ if (tensor_input == type){
+ _shape = input->info()->tensor_shape();
+ if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
+ return false;
+ }
+ for(int i = 0; i < 16; ++i){
+ if(cache.input[i] == nullptr) break;
+ _shape = cache.input[i]->info()->tensor_shape();
+ if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
+ this->layer = cache.layer[i];
+ this->input = cache.input[i];
+ this->output = cache.output[i];
+ this->weights = cache.weights[i];
+ this->biases = cache.biases[i];
+ return false;
+ }
+ }
+ }
+#endif //USE_CONV_CACHE
+ switch (type) {
+ case tensor_biases:
+ _shape = biases->info()->tensor_shape();
+ break;
+ case tensor_weights:
+ _shape = weights->info()->tensor_shape();
+ break;
+ case tensor_output:
+ _shape = output->info()->tensor_shape();
+ break;
+ case tensor_input:
+ default:
+ _shape = input->info()->tensor_shape();
+ break;
+ }
+ if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
+ return false;
+ }
+ freelayer();
+ return true;
+}
+
+INSTANTIATE_ACLBASECLASS(CLNormalizationLayer,NENormalizationLayer);
+ INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,GPUTensor);
+ INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,CPUTensor);
+INSTANTIATE_ACLBASECLASS(CLActivationLayer,NEActivationLayer);
+ INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,GPUTensor);
+ INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,CPUTensor);
+INSTANTIATE_ACLBASECLASS(CLPoolingLayer,NEPoolingLayer);
+ INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,GPUTensor);
+ INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,CPUTensor);
+INSTANTIATE_ACLBASECLASS(CLSoftmaxLayer,NESoftmaxLayer);
+ INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,GPUTensor);
+ INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,CPUTensor);
+INSTANTIATE_ACLBASECLASS(CLFullyConnectedLayer,NEFullyConnectedLayer);
+ INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,GPUTensor);
+ INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,CPUTensor);
+INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEConvolutionLayer);
+ INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,GPUTensor);
+ INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,CPUTensor);
+
+}
+
+#endif
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 4f6f9bcc..63097caa 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -6,6 +6,10 @@
#include "caffe/common.hpp"
#include "caffe/util/rng.hpp"
+#ifdef USE_ACL
+#include "arm_compute/runtime/CL/CLScheduler.h"
+using namespace arm_compute;
+#endif
namespace caffe {
@@ -52,8 +56,12 @@ void GlobalInit(int* pargc, char*** pargv) {
#ifdef CPU_ONLY // CPU-only Caffe.
Caffe::Caffe()
- : random_generator_(), mode_(Caffe::CPU),
- solver_count_(1), solver_rank_(0), multiprocess_(false) { }
+    : random_generator_(), mode_(Caffe::CPU),
+#ifdef USE_ACL
+      use_mali_gpu_(false),
+#endif
+      solver_count_(1), solver_rank_(0), multiprocess_(false) {
+#ifdef USE_ACL
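+  // Set up ACL's default OpenCL context and command queue for the Mali GPU.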
+ CLScheduler::get().default_init();
+#endif
+}
Caffe::~Caffe() { }
diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp
index 684ae88b..677ed100 100644
--- a/src/caffe/layer.cpp
+++ b/src/caffe/layer.cpp
@@ -1,7 +1,81 @@
#include "caffe/layer.hpp"
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+#include <time.h>
+
+#endif
+#endif //USE_PROFILING
+
namespace caffe {
INSTANTIATE_CLASS(Layer);
+#ifdef USE_PROFILING
+#ifdef LAYER_PERF_STAT
+
+/* current timestamp in microseconds; CLOCK_MONOTONIC_COARSE trades a little
+   resolution for a cheap read, which is enough for per-layer totals */
+unsigned long get_cur_time(void)
+{
+ struct timespec tm;
+
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &tm);
+
+ return (tm.tv_sec*1000000+tm.tv_nsec/1000);
+}
+
+
+// Forward and backward wrappers. You should implement the cpu and
+// gpu specific implementations instead, and should not change these
+// functions.
+template <typename Dtype>
+Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ Dtype loss = 0;
+ Reshape(bottom, top);
+
+ saved_top=&top;
+ saved_bottom=&bottom;
+
+ time_stat_.count++;
+ time_stat_.start=get_cur_time();
+
+ switch (Caffe::mode()) {
+ case Caffe::CPU:
+ Forward_cpu(bottom, top);
+ for (int top_id = 0; top_id < top.size(); ++top_id) {
+ if (!this->loss(top_id)) { continue; }
+ const int count = top[top_id]->count();
+ const Dtype* data = top[top_id]->cpu_data();
+ const Dtype* loss_weights = top[top_id]->cpu_diff();
+ loss += caffe_cpu_dot(count, data, loss_weights);
+ }
+ break;
+ case Caffe::GPU:
+ Forward_gpu(bottom, top);
+#ifndef CPU_ONLY
+ for (int top_id = 0; top_id < top.size(); ++top_id) {
+ if (!this->loss(top_id)) { continue; }
+ const int count = top[top_id]->count();
+ const Dtype* data = top[top_id]->gpu_data();
+ const Dtype* loss_weights = top[top_id]->gpu_diff();
+ Dtype blob_loss = 0;
+ caffe_gpu_dot(count, data, loss_weights, &blob_loss);
+ loss += blob_loss;
+ }
+#endif
+ break;
+ default:
+ LOG(FATAL) << "Unknown caffe mode.";
+ }
+ time_stat_.end=get_cur_time();
+ time_stat_.used=time_stat_.end-time_stat_.start;
+ time_stat_.total+=time_stat_.used;
+ return loss;
+}
+
+#endif
+#endif //USE_PROFILING
+
} // namespace caffe
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index f14253a5..f7bf8863 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -27,6 +27,19 @@
#include "caffe/layers/cudnn_tanh_layer.hpp"
#endif
+#ifdef USE_ACL
+#include "caffe/layers/acl_absval_layer.hpp"
+#include "caffe/layers/acl_bnll_layer.hpp"
+#include "caffe/layers/acl_conv_layer.hpp"
+#include "caffe/layers/acl_inner_product_layer.hpp"
+#include "caffe/layers/acl_lrn_layer.hpp"
+#include "caffe/layers/acl_pooling_layer.hpp"
+#include "caffe/layers/acl_relu_layer.hpp"
+#include "caffe/layers/acl_sigmoid_layer.hpp"
+#include "caffe/layers/acl_softmax_layer.hpp"
+#include "caffe/layers/acl_tanh_layer.hpp"
+#endif
+
#ifdef WITH_PYTHON_LAYER
#include "caffe/layers/python_layer.hpp"
#endif
@@ -39,6 +52,9 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
const LayerParameter& param) {
ConvolutionParameter conv_param = param.convolution_param();
ConvolutionParameter_Engine engine = conv_param.engine();
+#ifdef USE_ACL
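+  // ACL takes precedence over the engine setting; per-call fallback to the
+  // plain Caffe path happens inside ACLConvolutionLayer (bypass flag, grouped
+  // or dilated convolutions, more than two spatial axes).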
+ return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype>(param));
+#endif
#ifdef USE_CUDNN
bool use_dilation = false;
for (int i = 0; i < conv_param.dilation_size(); ++i) {
@@ -77,6 +93,9 @@ REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer);
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
PoolingParameter_Engine engine = param.pooling_param().engine();
+#ifdef USE_ACL
+ return shared_ptr<Layer<Dtype> >(new ACLPoolingLayer<Dtype>(param));
+#endif
if (engine == PoolingParameter_Engine_DEFAULT) {
engine = PoolingParameter_Engine_CAFFE;
#ifdef USE_CUDNN
@@ -115,7 +134,9 @@ REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetLRNLayer(const LayerParameter& param) {
LRNParameter_Engine engine = param.lrn_param().engine();
-
+#ifdef USE_ACL
+ return shared_ptr<Layer<Dtype> >(new ACLLRNLayer<Dtype>(param));
+#endif
if (engine == LRNParameter_Engine_DEFAULT) {
#ifdef USE_CUDNN
engine = LRNParameter_Engine_CUDNN;
@@ -153,6 +174,9 @@ REGISTER_LAYER_CREATOR(LRN, GetLRNLayer);
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
ReLUParameter_Engine engine = param.relu_param().engine();
+#ifdef USE_ACL
+ return shared_ptr<Layer<Dtype> >(new ACLReLULayer<Dtype>(param));
+#endif
if (engine == ReLUParameter_Engine_DEFAULT) {
engine = ReLUParameter_Engine_CAFFE;
#ifdef USE_CUDNN
@@ -177,6 +201,9 @@ REGISTER_LAYER_CREATOR(ReLU, GetReLULayer);
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) {
SigmoidParameter_Engine engine = param.sigmoid_param().engine();
+#ifdef USE_ACL
+ return shared_ptr<Layer<Dtype> >(new ACLSigmoidLayer<Dtype>(param));
+#endif
if (engine == SigmoidParameter_Engine_DEFAULT) {
engine = SigmoidParameter_Engine_CAFFE;
#ifdef USE_CUDNN
@@ -201,6 +228,9 @@ REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer);
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetSoftmaxLayer(const LayerParameter& param) {
SoftmaxParameter_Engine engine = param.softmax_param().engine();
+#ifdef USE_ACL
+ return shared_ptr<Layer<Dtype> >(new ACLSoftmaxLayer<Dtype>(param));
+#endif
if (engine == SoftmaxParameter_Engine_DEFAULT) {
engine = SoftmaxParameter_Engine_CAFFE;
#ifdef USE_CUDNN
@@ -225,6 +255,9 @@ REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer);
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) {
TanHParameter_Engine engine = param.tanh_param().engine();
+#ifdef USE_ACL
+ return shared_ptr<Layer<Dtype> >(new ACLTanHLayer<Dtype>(param));
+#endif
if (engine == TanHParameter_Engine_DEFAULT) {
engine = TanHParameter_Engine_CAFFE;
#ifdef USE_CUDNN
@@ -245,6 +278,33 @@ shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) {
REGISTER_LAYER_CREATOR(TanH, GetTanHLayer);
+#ifdef USE_ACL
+// Creator that always returns the ACL AbsVal layer.
+template <typename Dtype>
+shared_ptr<Layer<Dtype> > GetAbsValLayer(const LayerParameter& param) {
+ return shared_ptr<Layer<Dtype> >(new ACLAbsValLayer<Dtype>(param));
+}
+
+REGISTER_LAYER_CREATOR(AbsVal, GetAbsValLayer);
+
+// Creator that always returns the ACL BNLL layer.
+template <typename Dtype>
+shared_ptr<Layer<Dtype> > GetBNLLLayer(const LayerParameter& param) {
+ return shared_ptr<Layer<Dtype> >(new ACLBNLLLayer<Dtype>(param));
+}
+
+REGISTER_LAYER_CREATOR(BNLL, GetBNLLLayer);
+
+// Creator that always returns the ACL InnerProduct layer.
+template <typename Dtype>
+shared_ptr<Layer<Dtype> > GetInnerProductLayer(const LayerParameter& param) {
+ return shared_ptr<Layer<Dtype> >(new ACLInnerProductLayer<Dtype>(param));
+}
+
+REGISTER_LAYER_CREATOR(InnerProduct, GetInnerProductLayer);
+
+#endif // USE_ACL
+
#ifdef WITH_PYTHON_LAYER
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetPythonLayer(const LayerParameter& param) {
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 855bf0bf..65980e4d 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -39,6 +39,8 @@ STUB_GPU(AbsValLayer);
#endif
INSTANTIATE_CLASS(AbsValLayer);
+#ifndef USE_ACL
REGISTER_LAYER_CLASS(AbsVal);
+#endif
} // namespace caffe
diff --git a/src/caffe/layers/acl_absval_layer.cpp b/src/caffe/layers/acl_absval_layer.cpp
new file mode 100644
index 00000000..b0b0304d
--- /dev/null
+++ b/src/caffe/layers/acl_absval_layer.cpp
@@ -0,0 +1,64 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/layers/acl_absval_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLAbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ AbsValLayer<Dtype>::LayerSetUp(bottom, top);
+ ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_ABSVAL;
+}
+
+template <typename Dtype>
+void ACLAbsValLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::ABS);
+}
+
+template <typename Dtype>
+void ACLAbsValLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ AbsValLayer<Dtype>::Reshape(bottom, top);
+ ACLBaseActivationLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLAbsValLayer<Dtype>::Forward_cpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_ABSVAL_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ AbsValLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
+}
+
+template <typename Dtype>
+void ACLAbsValLayer<Dtype>::Forward_gpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_ABSVAL_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ AbsValLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top);
+}
+
+template <typename Dtype>
+ACLAbsValLayer<Dtype>::~ACLAbsValLayer() {
+}
+
+INSTANTIATE_CLASS(ACLAbsValLayer);
+
+} // namespace caffe
+
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_base_activation_layer.cpp b/src/caffe/layers/acl_base_activation_layer.cpp
new file mode 100644
index 00000000..2595a951
--- /dev/null
+++ b/src/caffe/layers/acl_base_activation_layer.cpp
@@ -0,0 +1,97 @@
+#ifdef USE_ACL
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layers/acl_base_activation_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLBaseActivationLayer<Dtype>::LayerSetUp(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+}
+template <typename Dtype>
+void ACLBaseActivationLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
+
+ const unsigned int count = bottom[0]->count();
+ const unsigned int count_ = top[0]->count();
+ TensorShape input_shape(count);
+ TensorShape output_shape(count_);
+ checkreshape(input_shape,Caffe::arm_gpu_mode());
+ if (!this->init_layer_) return;
+ this->init_layer_=false;
+ // Initialize ACL.
+ if (Caffe::arm_gpu_mode()) {
+ new_gpulayer();
+ }else{
+ new_cpulayer();
+ }
+
+ this->force_bypass_acl_path_=false;
+ ActivationLayerInfo act_info(type);
+
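+  // ACL's TANH activation computes a*tanh(b*x); a=b=1 gives Caffe's standard tanh.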
+ if(type== ActivationLayerInfo::ActivationFunction::TANH)
+ act_info=ActivationLayerInfo(type,1.0,1.0);
+
+ if (Caffe::arm_gpu_mode()) {
+ Dtype *top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ this->gpu().input=new_tensor<GPUTensor>(input_shape,(void*)bottom_data);
+ this->gpu().output=new_tensor<GPUTensor>(output_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->gpu().layer->configure(this->gpu().input,this->gpu().output,act_info);
+ }else{
+ Dtype *top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ this->cpu().input=new_tensor<CPUTensor>(input_shape,(void*)bottom_data);
+ this->cpu().output=new_tensor<CPUTensor>(output_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->cpu().layer->configure(this->cpu().input,this->cpu().output,act_info);
+ }
+}
+template <typename Dtype>
+void ACLBaseActivationLayer<Dtype>::Reshape(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+}
+
+template <typename Dtype>
+void ACLBaseActivationLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ if(Caffe::arm_gpu_mode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+ Dtype* top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ SetupACLLayer(bottom,top);
+ tensor_mem(this->cpu().input,(void*)(bottom_data));
+ cpu_run();
+ tensor_mem((void*)(top_data),this->cpu().output);
+}
+
+template <typename Dtype>
+void ACLBaseActivationLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ Dtype* top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ SetupACLLayer(bottom,top);
+ tensor_mem(this->gpu().input,(void*)(bottom_data));
+ gpu_run();
+ tensor_mem((void*)(top_data),this->gpu().output);
+}
+
+template <typename Dtype>
+ACLBaseActivationLayer<Dtype>::~ACLBaseActivationLayer() {
+}
+
+INSTANTIATE_CLASS(ACLBaseActivationLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_bnll_layer.cpp b/src/caffe/layers/acl_bnll_layer.cpp
new file mode 100644
index 00000000..86f09831
--- /dev/null
+++ b/src/caffe/layers/acl_bnll_layer.cpp
@@ -0,0 +1,61 @@
+#ifdef USE_ACL
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layers/acl_bnll_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLBNLLLayer<Dtype>::LayerSetUp(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ BNLLLayer<Dtype>::LayerSetUp(bottom, top);
+ ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BNLL;
+}
+template <typename Dtype>
+void ACLBNLLLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){
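+  // Caffe's BNLL computes log(1 + exp(x)), which is ACL's SOFT_RELU (softplus).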
+ ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::SOFT_RELU);
+}
+template <typename Dtype>
+void ACLBNLLLayer<Dtype>::Reshape(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ BNLLLayer<Dtype>::Reshape(bottom, top);
+ ACLBaseActivationLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLBNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_BNLL_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ BNLLLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
+}
+
+template <typename Dtype>
+void ACLBNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_BNLL_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ BNLLLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top);
+}
+
+template <typename Dtype>
+ACLBNLLLayer<Dtype>::~ACLBNLLLayer() {
+}
+
+INSTANTIATE_CLASS(ACLBNLLLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_conv_layer.cpp b/src/caffe/layers/acl_conv_layer.cpp
new file mode 100644
index 00000000..02732fb8
--- /dev/null
+++ b/src/caffe/layers/acl_conv_layer.cpp
@@ -0,0 +1,218 @@
+#ifdef USE_ACL
+#include <algorithm>
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layers/acl_conv_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::LayerSetUp(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ checkreshape(input_shape,Caffe::arm_gpu_mode());
+ if (!this->init_layer_) return;
+ this->init_layer_=false;
+ // Initialize ACL.
+ if (Caffe::arm_gpu_mode()) {
+ new_gpulayer();
+ }else{
+ new_cpulayer();
+ }
+ this->force_bypass_acl_path_=false;
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ int stride_x =this->stride_.mutable_cpu_data()[1];
+ int stride_y =this->stride_.mutable_cpu_data()[0];
+ int pad_x=this->pad_.mutable_cpu_data()[1];
+ int pad_y=this->pad_.mutable_cpu_data()[0];
+ unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1];
+ unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0];
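+  // Caffe stores spatial params as (y, x); ACL's PadStrideInfo takes
+  // (stride_x, stride_y, pad_x, pad_y), hence the swapped indices above.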
+ PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
+ TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
+ TensorShape biases_shape ((unsigned int)this->num_output_);
+ TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
+
+ if (Caffe::arm_gpu_mode()) {
+ Dtype *top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ //[kernel_x, kernel_y, IFM, OFM]
+ this->gpu().weights=new_tensor<GPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
+ tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
+ //[OFM]
+ if (this->bias_term_) {
+ this->gpu().biases=new_tensor<GPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
+ tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
+ }
+
+ //[width, height, IFM]
+ this->gpu().input=new_tensor<GPUTensor>(input_shape,(void*)bottom_data);
+ //[width, height, OFM]
+ this->gpu().output=new_tensor<GPUTensor>(output_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ {
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info);
+#ifdef USE_PROFILING
+ }
+#endif //USE_PROFILING
+#ifdef USE_CONV_CACHE
+ for(int i = 0; i < 16; ++i){
+ fprintf(stderr, "<GPU>check cache[%d]\n", i);
+ if(this->gpu().cache.layer[i] == nullptr){
+ this->gpu().cache.layer[i] = this->gpu().layer;
+ this->gpu().cache.input[i] = this->gpu().input;
+ this->gpu().cache.output[i] = this->gpu().output;
+ this->gpu().cache.weights[i] = this->gpu().weights;
+ this->gpu().cache.biases[i] = this->gpu().biases;
+ break;
+ }
+ }
+#endif //USE_CONV_CACHE
+ }else{
+ Dtype *top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ //[kernel_x, kernel_y, IFM, OFM]
+ this->cpu().weights=new_tensor<CPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
+ tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
+ //[OFM]
+ if (this->bias_term_) {
+ this->cpu().biases=new_tensor<CPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
+ tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
+ }
+
+ //[width, height, IFM]
+ this->cpu().input=new_tensor<CPUTensor>(input_shape,(void*)bottom_data);
+ //[width, height, OFM]
+ this->cpu().output=new_tensor<CPUTensor>(output_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ {
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info);
+#ifdef USE_PROFILING
+ }
+#endif //USE_PROFILING
+#ifdef USE_CONV_CACHE
+ for(int i = 0; i < 16; ++i){
+ fprintf(stderr, "<CPU>check cache[%d]\n", i);
+ if(this->cpu().cache.layer[i] == nullptr){
+ this->cpu().cache.layer[i] = this->cpu().layer;
+ this->cpu().cache.input[i] = this->cpu().input;
+ this->cpu().cache.output[i] = this->cpu().output;
+ this->cpu().cache.weights[i] = this->cpu().weights;
+ this->cpu().cache.biases[i] = this->cpu().biases;
+ break;
+ }
+ }
+#endif //USE_CONV_CACHE
+ }
+}
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Reshape(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ ConvolutionLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Forward_cpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ if(Caffe::arm_gpu_mode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONV_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_|| this->group_!=1) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ /* check dilation */
+ int dilated=0;
+
+ for(int i=0;i<this->num_spatial_axes_;i++)
+ {
+ const int *p=this->dilation_.cpu_data();
+
+ if(p[i]!=1)
+ dilated=1;
+ }
+ if(dilated) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+
+ SetupACLLayer(bottom,top);
+ for (int i = 0; i < bottom.size(); ++i) {
+ const Dtype* bottom_data = bottom[i]->cpu_data();
+ Dtype* top_data = top[i]->mutable_cpu_data();
+ tensor_mem(this->cpu().input,(void*)bottom_data);
+ cpu_run();
+ tensor_mem((void*)top_data,this->cpu().output);
+ }
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Forward_gpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONV_INFO);
+#endif //USE_PROFILING
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ if (this->force_bypass_acl_path_|| this->group_!=1) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 ) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ /* check dilation */
+ int dilated=0;
+
+ for(int i=0;i<this->num_spatial_axes_;i++)
+ {
+ const int *p=this->dilation_.gpu_data();
+
+ if(p[i]!=1)
+ dilated=1;
+ }
+
+ if(dilated) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ SetupACLLayer(bottom,top);
+ for (int i = 0; i < bottom.size(); ++i) {
+ const Dtype* bottom_data = bottom[i]->gpu_data();
+ Dtype* top_data = top[i]->mutable_gpu_data();
+ tensor_mem(this->gpu().input,(void*)bottom_data);
+ gpu_run();
+ tensor_mem((void*)top_data,this->gpu().output);
+ }
+}
+
+template <typename Dtype>
+ACLConvolutionLayer<Dtype>::~ACLConvolutionLayer() {
+}
+
+INSTANTIATE_CLASS(ACLConvolutionLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_inner_product_layer.cpp b/src/caffe/layers/acl_inner_product_layer.cpp
new file mode 100644
index 00000000..6e7a46e9
--- /dev/null
+++ b/src/caffe/layers/acl_inner_product_layer.cpp
@@ -0,0 +1,131 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layers/acl_inner_product_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ InnerProductLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_FC;
+}
+template <typename Dtype>
+void ACLInnerProductLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_);
+ TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_);
+ TensorShape biases_shape((unsigned int)this->N_);
+ TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_);
+ TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_);
+ checkreshape(input_shape,Caffe::arm_gpu_mode());
+ if (!this->init_layer_) return;
+ this->init_layer_=false;
+ // Initialize ACL.
+ if (Caffe::arm_gpu_mode()) {
+ new_gpulayer();
+ }else{
+ new_cpulayer();
+ }
+
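+  // Caffe's transpose param and ACL's transpose_weights flag appear to have
+  // opposite senses, hence the negation (and the transposed weight shape below).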
+ bool transpose = !this->layer_param_.inner_product_param().transpose();
+ this->force_bypass_acl_path_ = false;
+ if (Caffe::arm_gpu_mode()) {
+ Dtype *top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ if (transpose) {
+ this->gpu().weights=new_tensor<GPUTensor>(weights_shape_t,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
+ }else{
+ this->gpu().weights=new_tensor<GPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
+ }
+ tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
+ if (this->bias_term_) {
+ this->gpu().biases=new_tensor<GPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
+ tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
+ }
+ this->gpu().input=new_tensor<GPUTensor>(input_shape,(void*)bottom_data);
+ this->gpu().output=new_tensor<GPUTensor>(output_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,transpose);
+ }else{
+ Dtype *top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ if (transpose) {
+ this->cpu().weights=new_tensor<CPUTensor>(weights_shape_t,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
+ }else{
+ this->cpu().weights=new_tensor<CPUTensor>(weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
+ }
+ tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
+ if (this->bias_term_) {
+ this->cpu().biases=new_tensor<CPUTensor>(biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
+ tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
+ }
+ this->cpu().input=new_tensor<CPUTensor>(input_shape,(void*)bottom_data);
+ this->cpu().output=new_tensor<CPUTensor>(output_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,transpose);
+ }
+}
+template <typename Dtype>
+void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ InnerProductLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ if(Caffe::arm_gpu_mode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_FC_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ Dtype* top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ SetupACLLayer(bottom,top);
+ tensor_mem(this->cpu().input,(void*)(bottom_data));
+ cpu_run();
+ tensor_mem((void*)(top_data),this->cpu().output);
+}
+
+template <typename Dtype>
+void ACLInnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_FC_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ Dtype* top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ SetupACLLayer(bottom,top);
+ tensor_mem(this->gpu().input,(void*)(bottom_data));
+ gpu_run();
+ tensor_mem((void*)(top_data),this->gpu().output);
+}
+
+template <typename Dtype>
+ACLInnerProductLayer<Dtype>::~ACLInnerProductLayer() {
+}
+
+INSTANTIATE_CLASS(ACLInnerProductLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_lrn_layer.cpp b/src/caffe/layers/acl_lrn_layer.cpp
new file mode 100644
index 00000000..9c03cad1
--- /dev/null
+++ b/src/caffe/layers/acl_lrn_layer.cpp
@@ -0,0 +1,144 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/layers/acl_lrn_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ LRNLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
+}
+template <typename Dtype>
+void ACLLRNLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_);
+ checkreshape(shape,Caffe::arm_gpu_mode());
+  if (!this->init_layer_) return;
+  this->init_layer_=false;
+ // Initialize ACL.
+ if (Caffe::arm_gpu_mode()) {
+ new_gpulayer();
+ }else{
+ new_cpulayer();
+ }
+
+ //this->force_bypass_acl_path_=false;
+ NormalizationLayerInfo *norm_info;
+ if(this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL)
+ norm_info=new NormalizationLayerInfo(NormType::IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+ else
+ norm_info=new NormalizationLayerInfo(NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+
+ if (Caffe::arm_gpu_mode()) {
+ Dtype *top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ this->gpu().input=new_tensor<GPUTensor>(shape,(void*)bottom_data);
+ this->gpu().output=new_tensor<GPUTensor>(shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->gpu().layer->configure(this->gpu().input,this->gpu().output,*norm_info);
+ }else{
+ Dtype *top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ this->cpu().input=new_tensor<CPUTensor>(shape,(void*)bottom_data);
+ this->cpu().output=new_tensor<CPUTensor>(shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->cpu().layer->configure(this->cpu().input,this->cpu().output,*norm_info);
+ }
+ delete norm_info;
+}
+template <typename Dtype>
+void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ LRNLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLLRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ if(Caffe::arm_gpu_mode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_LRN_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) {
+ LRNLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ Dtype* top_data = top[0]->mutable_cpu_data();
+ SetupACLLayer(bottom,top);
+ switch (this->layer_param_.lrn_param().norm_region()) {
+ case LRNParameter_NormRegion_ACROSS_CHANNELS:
+ for (int n = 0; n < this->num_; ++n) {
+ tensor_mem(this->cpu().input,(void*)(bottom_data+ bottom[0]->offset(n)));
+ cpu_run();
+ tensor_mem((void*)(top_data + top[0]->offset(n)),this->cpu().output);
+ }
+ break;
+ case LRNParameter_NormRegion_WITHIN_CHANNEL:
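+    // Unreachable here: WITHIN_CHANNEL inputs are routed to the Caffe
+    // implementation above; kept for symmetry with Forward_gpu.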
+ for (int n = 0; n < bottom[0]->num(); ++n) {
+ tensor_mem(this->cpu().input,(void*)(bottom_data));
+ cpu_run();
+ tensor_mem((void*)(top_data),this->cpu().output);
+ bottom_data += bottom[0]->offset(0, 1);
+ top_data += top[0]->offset(0, 1);
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unknown normalization region.";
+ }
+}
+
+template <typename Dtype>
+void ACLLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_LRN_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ LRNLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ Dtype* top_data = top[0]->mutable_gpu_data();
+ SetupACLLayer(bottom,top);
+ switch (this->layer_param_.lrn_param().norm_region()) {
+ case LRNParameter_NormRegion_ACROSS_CHANNELS:
+ for (int n = 0; n < this->num_; ++n) {
+ tensor_mem(this->gpu().input,(void*)(bottom_data+ bottom[0]->offset(n)));
+ gpu_run();
+ tensor_mem((void*)(top_data + top[0]->offset(n)),this->gpu().output);
+ }
+ break;
+ case LRNParameter_NormRegion_WITHIN_CHANNEL:
+ for (int n = 0; n < bottom[0]->num(); ++n) {
+ tensor_mem(this->gpu().input,(void*)(bottom_data));
+ gpu_run();
+ tensor_mem((void*)(top_data),this->gpu().output);
+ bottom_data += bottom[0]->offset(0, 1);
+ top_data += top[0]->offset(0, 1);
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unknown normalization region.";
+ }
+}
+
+template <typename Dtype>
+ACLLRNLayer<Dtype>::~ACLLRNLayer() {
+}
+
+INSTANTIATE_CLASS(ACLLRNLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_pooling_layer.cpp b/src/caffe/layers/acl_pooling_layer.cpp
new file mode 100644
index 00000000..951de3e4
--- /dev/null
+++ b/src/caffe/layers/acl_pooling_layer.cpp
@@ -0,0 +1,150 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/layers/acl_pooling_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ PoolingLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
+}
+template <typename Dtype>
+void ACLPoolingLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_);
+ TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_);
+ checkreshape(in_shape,Caffe::arm_gpu_mode());
+ if (!this->init_layer_) return;
+ this->init_layer_=false;
+ // Initialize ACL.
+ if (Caffe::arm_gpu_mode()) {
+ new_gpulayer();
+ }else{
+ new_cpulayer();
+ }
+
+ this->force_bypass_acl_path_=false;
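+  // ACL pooling takes a single (square) kernel size; Forward_* bypasses to
+  // Caffe whenever kernel_h_ != kernel_w_, so using kernel_w_ alone is safe.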
+ PoolingLayerInfo *pool_info;
+ if(this->layer_param_.pooling_param().pool()==PoolingParameter_PoolMethod_MAX)
+ pool_info=new PoolingLayerInfo(PoolingType::MAX, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL));
+ else
+ pool_info=new PoolingLayerInfo(PoolingType::AVG, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL));
+
+ if (Caffe::arm_gpu_mode()) {
+ Dtype *top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ this->gpu().input=new_tensor<GPUTensor>(in_shape,(void*)bottom_data);
+ this->gpu().output=new_tensor<GPUTensor>(out_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->gpu().layer->configure(this->gpu().input,this->gpu().output,*pool_info);
+ }else{
+ Dtype *top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ this->cpu().input=new_tensor<CPUTensor>(in_shape,(void*)bottom_data);
+ this->cpu().output=new_tensor<CPUTensor>(out_shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->cpu().layer->configure(this->cpu().input,this->cpu().output,*pool_info);
+ }
+ delete pool_info;
+}
+template <typename Dtype>
+void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ PoolingLayer<Dtype>::Reshape(bottom, top);
+
+}
+
+template <typename Dtype>
+void ACLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ if(Caffe::arm_gpu_mode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_POOLING_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ Dtype* top_data = top[0]->mutable_cpu_data();
+ if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
+ this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ if (this->kernel_h_!=this->kernel_w_ || top.size()>1) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ if (this->kernel_h_!=2 && this->kernel_h_!=3) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ SetupACLLayer(bottom,top);
+ for (int n = 0; n < bottom[0]->num(); ++n) {
+ for (int c = 0; c < this->channels_; ++c) {
+ tensor_mem(this->cpu().input,(void*)(bottom_data));
+ cpu_run();
+ tensor_mem((void*)(top_data),this->cpu().output);
+ bottom_data += bottom[0]->offset(0, 1);
+ top_data += top[0]->offset(0, 1);
+ }
+ }
+}
+
+template <typename Dtype>
+void ACLPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_POOLING_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ Dtype* top_data = top[0]->mutable_gpu_data();
+ if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
+ this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ if (this->kernel_h_!=this->kernel_w_) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ if (this->kernel_h_!=2 && this->kernel_h_!=3) {
+ PoolingLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ SetupACLLayer(bottom,top);
+ for (int n = 0; n < bottom[0]->num(); ++n) {
+ for (int c = 0; c < this->channels_; ++c) {
+ tensor_mem(this->gpu().input,(void*)(bottom_data));
+ gpu_run();
+ tensor_mem((void*)(top_data),this->gpu().output);
+ bottom_data += bottom[0]->offset(0, 1);
+ top_data += top[0]->offset(0, 1);
+ }
+ }
+}
+
+template <typename Dtype>
+ACLPoolingLayer<Dtype>::~ACLPoolingLayer() {
+}
+
+INSTANTIATE_CLASS(ACLPoolingLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_relu_layer.cpp b/src/caffe/layers/acl_relu_layer.cpp
new file mode 100644
index 00000000..03194539
--- /dev/null
+++ b/src/caffe/layers/acl_relu_layer.cpp
@@ -0,0 +1,70 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/layers/acl_relu_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ ReLULayer<Dtype>::LayerSetUp(bottom, top);
+ ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
+}
+template <typename Dtype>
+void ACLReLULayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::RELU);
+}
+template <typename Dtype>
+void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ ReLULayer<Dtype>::Reshape(bottom, top);
+ ACLBaseActivationLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_RELU_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ ReLULayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ // Fallback to standard Caffe for leaky ReLU.
+ if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
+ ReLULayer<Dtype>::Forward_cpu(bottom, top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
+}
+
+template <typename Dtype>
+void ACLReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_RELU_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ ReLULayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ // Fallback to standard Caffe for leaky ReLU.
+ if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
+ ReLULayer<Dtype>::Forward_cpu(bottom, top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top);
+}
+
+template <typename Dtype>
+ACLReLULayer<Dtype>::~ACLReLULayer() {
+}
+
+INSTANTIATE_CLASS(ACLReLULayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_sigmoid_layer.cpp b/src/caffe/layers/acl_sigmoid_layer.cpp
new file mode 100644
index 00000000..eac15651
--- /dev/null
+++ b/src/caffe/layers/acl_sigmoid_layer.cpp
@@ -0,0 +1,61 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/layers/acl_sigmoid_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ SigmoidLayer<Dtype>::LayerSetUp(bottom, top);
+ ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SIGMOID;
+}
+
+template <typename Dtype>
+void ACLSigmoidLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
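+  // The incoming 'type' argument is ignored: sigmoid always maps to ACL's
+  // LOGISTIC activation.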
+ ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::LOGISTIC);
+}
+template <typename Dtype>
+void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ SigmoidLayer<Dtype>::Reshape(bottom, top);
+ ACLBaseActivationLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLSigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_SIGMOID_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ SigmoidLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
+}
+
+template <typename Dtype>
+void ACLSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_SIGMOID_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ SigmoidLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top);
+}
+
+template <typename Dtype>
+ACLSigmoidLayer<Dtype>::~ACLSigmoidLayer() {
+}
+
+INSTANTIATE_CLASS(ACLSigmoidLayer);
+
+} // namespace caffe
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_softmax_layer.cpp b/src/caffe/layers/acl_softmax_layer.cpp
new file mode 100644
index 00000000..d32460b9
--- /dev/null
+++ b/src/caffe/layers/acl_softmax_layer.cpp
@@ -0,0 +1,117 @@
+#ifdef USE_ACL
+#include <vector>
+#include "caffe/layers/acl_softmax_layer.hpp"
+#include <unistd.h>
+
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
+}
+template <typename Dtype>
+void ACLSoftmaxLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ unsigned int channels = bottom[0]->shape(this->softmax_axis_);
+ TensorShape shape(channels*this->inner_num_);
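+  // Forward_* bypasses when inner_num_ > 1, so this is effectively a flat
+  // tensor of 'channels' elements per outer slice.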
+ checkreshape(shape,Caffe::arm_gpu_mode());
+ if (!this->init_layer_) return;
+ this->init_layer_=false;
+
+ // Initialize ACL.
+ if (Caffe::arm_gpu_mode()) {
+ new_gpulayer();
+ }else{
+ new_cpulayer();
+ }
+
+ //this->force_bypass_acl_path_=false;
+ if (Caffe::arm_gpu_mode()) {
+ Dtype *top_data = top[0]->mutable_gpu_data();
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ this->gpu().input=new_tensor<GPUTensor>(shape,(void*)bottom_data);
+ this->gpu().output=new_tensor<GPUTensor>(shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->gpu().layer->configure(this->gpu().input,this->gpu().output);
+ }else{
+ Dtype *top_data = top[0]->mutable_cpu_data();
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ this->cpu().input=new_tensor<CPUTensor>(shape,(void*)bottom_data);
+ this->cpu().output=new_tensor<CPUTensor>(shape,(void*)top_data);
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONFIG_INFO);
+#endif //USE_PROFILING
+ this->cpu().layer->configure(this->cpu().input,this->cpu().output);
+ }
+}
+template <typename Dtype>
+void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ SoftmaxLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLSoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ if(Caffe::arm_gpu_mode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_SOFTMAX_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_ || this->inner_num_>1) {
+ SoftmaxLayer<Dtype>::Forward_cpu(bottom,top);
+ return ;
+ }
+ const Dtype* bottom_data = bottom[0]->cpu_data();
+ Dtype* top_data = top[0]->mutable_cpu_data();
+ SetupACLLayer(bottom,top);
+
+ int channels = bottom[0]->shape(this->softmax_axis_);
+
+ for (int i = 0; i < this->outer_num_; ++i) {
+ tensor_mem(this->cpu().input,(void*)(bottom_data));
+ cpu_run();
+ tensor_mem((void*)(top_data),this->cpu().output);
+ top_data += channels;
+ bottom_data += channels;
+ }
+}
+
+template <typename Dtype>
+void ACLSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_SOFTMAX_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_|| this->inner_num_>1) {
+ SoftmaxLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ const Dtype* bottom_data = bottom[0]->gpu_data();
+ Dtype* top_data = top[0]->mutable_gpu_data();
+ SetupACLLayer(bottom,top);
+  int channels = bottom[0]->shape(this->softmax_axis_);
+  for (int i = 0; i < this->outer_num_; ++i) {
+    tensor_mem(this->gpu().input,(void*)(bottom_data));
+    gpu_run();
+    tensor_mem((void*)(top_data),this->gpu().output);
+    // Advance by a full channel slice per outer iteration, matching
+    // Forward_cpu (inner_num_ is 1 on this path).
+    top_data += channels;
+    bottom_data += channels;
+  }
+}
+
+template <typename Dtype>
+ACLSoftmaxLayer<Dtype>::~ACLSoftmaxLayer() {
+}
+
+INSTANTIATE_CLASS(ACLSoftmaxLayer);
+} // namespace caffe
+
+#endif // USE_ACL
diff --git a/src/caffe/layers/acl_tanh_layer.cpp b/src/caffe/layers/acl_tanh_layer.cpp
new file mode 100644
index 00000000..a1bb632c
--- /dev/null
+++ b/src/caffe/layers/acl_tanh_layer.cpp
@@ -0,0 +1,63 @@
+#ifdef USE_ACL
+#include <vector>
+
+#include "caffe/layers/acl_tanh_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ACLTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ TanHLayer<Dtype>::LayerSetUp(bottom, top);
+ ACLBaseActivationLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_TANH;
+}
+
+template <typename Dtype>
+void ACLTanHLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){
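+  // The incoming 'type' argument is ignored: tanh always maps to ACL's
+  // TANH activation.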
+ ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::TANH);
+}
+
+template <typename Dtype>
+void ACLTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ TanHLayer<Dtype>::Reshape(bottom, top);
+ ACLBaseActivationLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+void ACLTanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_TANH_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ TanHLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
+}
+
+template <typename Dtype>
+void ACLTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_TANH_INFO);
+#endif //USE_PROFILING
+ if (this->force_bypass_acl_path_) {
+ TanHLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom,top);
+}
+
+template <typename Dtype>
+ACLTanHLayer<Dtype>::~ACLTanHLayer() {
+}
+
+INSTANTIATE_CLASS(ACLTanHLayer);
+
+} // namespace caffe
+
+#endif // USE_ACL
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index 448d86d7..5293373d 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -42,6 +42,8 @@ STUB_GPU(BNLLLayer);
#endif
INSTANTIATE_CLASS(BNLLLayer);
+#ifndef USE_ACL
REGISTER_LAYER_CLASS(BNLL);
+#endif
} // namespace caffe
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 00716a92..7668854c 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
/*
TODO:
- load file in a separate thread ("prefetch")
@@ -184,3 +185,4 @@ INSTANTIATE_CLASS(HDF5DataLayer);
REGISTER_LAYER_CLASS(HDF5Data);
} // namespace caffe
+#endif // USE_HDF5
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu
index 33eebd41..70cd9f32 100644
--- a/src/caffe/layers/hdf5_data_layer.cu
+++ b/src/caffe/layers/hdf5_data_layer.cu
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
/*
TODO:
- only load parts of the file, in accordance with a prototxt param "max_mem"
@@ -34,3 +35,4 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer);
} // namespace caffe
+#endif // USE_HDF5
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index f8f1edcd..28c453a2 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include <vector>
#include "hdf5.h"
@@ -72,3 +73,4 @@ INSTANTIATE_CLASS(HDF5OutputLayer);
REGISTER_LAYER_CLASS(HDF5Output);
} // namespace caffe
+#endif // USE_HDF5
diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu
index c1685cd3..891aea03 100644
--- a/src/caffe/layers/hdf5_output_layer.cu
+++ b/src/caffe/layers/hdf5_output_layer.cu
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include <vector>
#include "hdf5.h"
@@ -37,3 +38,4 @@ void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer);
} // namespace caffe
+#endif // USE_HDF5
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index e65349f0..be3791e4 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -145,6 +145,8 @@ STUB_GPU(InnerProductLayer);
#endif
INSTANTIATE_CLASS(InnerProductLayer);
+#ifndef USE_ACL
REGISTER_LAYER_CLASS(InnerProduct);
+#endif
} // namespace caffe
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 353c2f95..7bf33e1d 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -5,7 +5,9 @@
#include <utility>
#include <vector>
+#ifdef USE_HDF5
#include "hdf5.h"
+#endif // USE_HDF5
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
@@ -769,7 +771,12 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
template <typename Dtype>
void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
+#ifdef USE_HDF5
if (H5Fis_hdf5(trained_filename.c_str())) {
+#else
+ if (trained_filename.size() >= 3 &&
+ trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) {
+#endif
CopyTrainedLayersFromHDF5(trained_filename);
} else {
CopyTrainedLayersFromBinaryProto(trained_filename);
@@ -786,6 +793,7 @@ void Net<Dtype>::CopyTrainedLayersFromBinaryProto(
template <typename Dtype>
void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) {
+#ifdef USE_HDF5
hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY,
H5P_DEFAULT);
CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename;
@@ -832,6 +840,10 @@ void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) {
}
H5Gclose(data_hid);
H5Fclose(file_hid);
+#else
+ LOG(FATAL) << "CopyTrainedLayersFromHDF5 requires hdf5;"
+ << " compile with USE_HDF5.";
+#endif // USE_HDF5
}
template <typename Dtype>
@@ -848,6 +860,7 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
template <typename Dtype>
void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const {
+#ifdef USE_HDF5
hid_t file_hid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
H5P_DEFAULT);
CHECK_GE(file_hid, 0)
@@ -901,6 +914,9 @@ void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const {
H5Gclose(diff_hid);
}
H5Fclose(file_hid);
+#else
+ LOG(FATAL) << "ToHDF5 requires hdf5; compile with USE_HDF5.";
+#endif // USE_HDF5
}
template <typename Dtype>
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
index ad6abe54..725602ab 100644
--- a/src/caffe/solvers/sgd_solver.cpp
+++ b/src/caffe/solvers/sgd_solver.cpp
@@ -278,6 +278,7 @@ void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto(
template <typename Dtype>
void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
const string& model_filename) {
+#ifdef USE_HDF5
string snapshot_filename =
Solver<Dtype>::SnapshotFilename(".solverstate.h5");
LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename;
@@ -299,6 +300,10 @@ void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
}
H5Gclose(history_hid);
H5Fclose(file_hid);
+#else
+ LOG(FATAL) << "SnapshotSolverStateToHDF5 requires hdf5;"
+ << " compile with USE_HDF5.";
+#endif // USE_HDF5
}
template <typename Dtype>
@@ -323,6 +328,7 @@ void SGDSolver<Dtype>::RestoreSolverStateFromBinaryProto(
template <typename Dtype>
void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
+#ifdef USE_HDF5
hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file;
this->iter_ = hdf5_load_int(file_hid, "iter");
@@ -344,6 +350,10 @@ void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
}
H5Gclose(history_hid);
H5Fclose(file_hid);
+#else
+ LOG(FATAL) << "RestoreSolverStateFromHDF5 requires hdf5;"
+ << " compile with USE_HDF5.";
+#endif // USE_HDF5
}
INSTANTIATE_CLASS(SGDSolver);
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 88d9b785..d74ad31e 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -54,8 +54,10 @@ inline void SyncedMemory::to_cpu() {
caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
#else
+#ifndef USE_ACL
NO_GPU;
#endif
+#endif
break;
case HEAD_AT_CPU:
case SYNCED:
@@ -113,9 +115,14 @@ const void* SyncedMemory::gpu_data() {
to_gpu();
return (const void*)gpu_ptr_;
#else
+#ifdef USE_ACL
+ to_cpu();
+ return (const void*)cpu_ptr_;
+#else
NO_GPU;
return NULL;
#endif
+#endif
}
void SyncedMemory::set_gpu_data(void* data) {
@@ -129,8 +136,14 @@ void SyncedMemory::set_gpu_data(void* data) {
head_ = HEAD_AT_GPU;
own_gpu_data_ = false;
#else
+#ifdef USE_ACL
+ gpu_ptr_ = data;
+ head_ = HEAD_AT_GPU;
+ own_gpu_data_ = false;
+#else
NO_GPU;
#endif
+#endif
}
void* SyncedMemory::mutable_cpu_data() {
@@ -147,9 +160,15 @@ void* SyncedMemory::mutable_gpu_data() {
head_ = HEAD_AT_GPU;
return gpu_ptr_;
#else
+#ifdef USE_ACL
+ to_cpu();
+ head_ = HEAD_AT_GPU;
+ return cpu_ptr_;
+#else
NO_GPU;
return NULL;
#endif
+#endif
}
#ifndef CPU_ONLY
diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp
index 2bc2de1e..3b0139d5 100644
--- a/src/caffe/test/test_hdf5_output_layer.cpp
+++ b/src/caffe/test/test_hdf5_output_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include <string>
#include <vector>
@@ -121,3 +122,4 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) {
}
} // namespace caffe
+#endif //USE_HDF5
\ No newline at end of file
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp
index 487f5176..7d27d77c 100644
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include <string>
#include <vector>
@@ -164,3 +165,4 @@ TYPED_TEST(HDF5DataLayerTest, TestSkip) {
}
} // namespace caffe
+#endif //USE_HDF5
\ No newline at end of file
diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
index ed737429..cefd853d 100644
--- a/src/caffe/util/hdf5.cpp
+++ b/src/caffe/util/hdf5.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
#include "caffe/util/hdf5.hpp"
#include <string>
@@ -207,3 +208,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx) {
}
} // namespace caffe
+#endif // USE_HDF5
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 71c02274..b0dd3020 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -85,6 +85,9 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) {
template <typename Dtype>
void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
if (X != Y) {
+#ifdef USE_ACL
+ memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn)
+#else
if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
// NOLINT_NEXT_LINE(caffe/alt_fn)
@@ -95,6 +98,7 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
} else {
memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn)
}
+#endif
}
}
diff --git a/unit_tests/Makefile b/unit_tests/Makefile
new file mode 100644
index 00000000..034ee02f
--- /dev/null
+++ b/unit_tests/Makefile
@@ -0,0 +1,87 @@
+
+include ../Makefile.config
+
+CXX_SRCS+=test_pooling_layer.cpp
+CXX_SRCS+=test_softmax_layer.cpp
+CXX_SRCS+= test_inner_product_layer.cpp
+CXX_SRCS+=test_neuron_layer.cpp
+CXX_SRCS+=test_lrn_layer.cpp #failed on single channel LRN
+#C_SRCS=pmu.c testbed.c
+CXX_SRCS+= test_convolution_layer.cpp
+#CXX_SRCS= test_fail.cpp
+BIN_SRCS=test_caffe_main.cpp test.cpp
+
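+# Adjust HOME to the directory holding the caffeOnACL and ComputeLibrary checkouts.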
+HOME=/home/firefly
+#
+# caffe related stuff
+#
+
+CAFFE_ROOT=$(HOME)/caffeOnACL
+CAFFE_INCS = -I$(CAFFE_ROOT)/include -I$(CAFFE_ROOT)/distribute/include/
+CAFFE_LIBS = -L$(CAFFE_ROOT)/distribute/lib -lcaffe -lglog -lgflags -lprotobuf -lboost_system -lboost_filesystem
+CAFFE_RPATH =$(CAFFE_ROOT)/distribute/lib
+
+
+#
+# google test related stuff
+#
+
+GTEST_ROOT=/usr/local
+GTEST_LIBS=$(GTEST_ROOT)/lib/libgtest_main.a $(GTEST_ROOT)/lib/libgtest.a
+GTEST_INCS=-I$(GTEST_ROOT)/include/
+
+
+#
+# arm compute
+#
+
+ACL_ROOT=$(HOME)/ComputeLibrary
+ACL_LIBS=-L$(ACL_ROOT)/build -L$(ACL_ROOT)/build/arm_compute -larm_compute -lOpenCL
+ACL_RPATH=$(ACL_ROOT)/build:$(ACL_ROOT)/build/arm_compute
+
+COMM_CFLAGS=$(GTEST_INCS) $(CAFFE_INCS) -Wall -g -DCPU_ONLY -DUSE_ACL
+#USE_PROFILING -- gather profiling information; output is controlled by LOGACL
+#LAYER_PERF_STAT -- Haitao's per-layer net profiling information
+ifeq ($(USE_PROFILING), 1)
+ COMM_CFLAGS += -DUSE_PROFILING -DLAYER_PERF_STAT
+endif
+
+CXXFLAGS=$(COMM_CFLAGS) -Wno-sign-compare
+CFLAGS=$(COMM_CFLAGS)
+
+CC=gcc
+CXX=g++
+
+COMM_OBJS=$(CXX_SRCS:.cpp=.o) $(C_SRCS:.c=.o)
+BIN_OBJS+=$(BIN_SRCS:.cpp=.o)
+BIN_EXES=$(BIN_OBJS:.o=)
+
+LIBS+=$(CAFFE_LIBS) $(GTEST_LIBS) -lpthread -lopenblas $(ACL_LIBS)
+
+RT_PATH=-Wl,-rpath,$(CAFFE_RPATH):$(ACL_RPATH)
+
+LDFLAGS+=$(RT_PATH)
+
+
+all : $(BIN_EXES)
+
+$(BIN_EXES):%:%.o
+
+$(BIN_EXES):$(COMM_OBJS)
+
+
+
+clean:
+ rm -f $(BIN_EXES) *.o *.so
+
+.PHONY : all clean
+
+%.o : %.c
+ $(CC) $(CFLAGS) -c $< -o $@
+
+%.o : %.cpp
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+%: %.o
+ $(CXX) $(LDFLAGS) $< $(COMM_OBJS) -o $@ $(LIBS)
+
diff --git a/unit_tests/pmu.c b/unit_tests/pmu.c
new file mode 100644
index 00000000..8c2f6b31
--- /dev/null
+++ b/unit_tests/pmu.c
@@ -0,0 +1,376 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "pmu.h"
+
+#define MAX_SLOT_NUM 64 /* defined by the ARMv8 spec */
+#define ARMV8_PMCR_N_SHIFT 11 /* Number of counters supported */
+#define ARMV8_PMCR_N_MASK 0x1f
+
+
+static __thread struct pmu_event_record * g_rec_ptr[MAX_SLOT_NUM];
+static __thread int max_counter_slot;
+
+/* start and stop counter */
+
+static void stop_event_profile(struct pmu_event_record * p_record)
+{
+ int slot=p_record->p_evt->slot;
+
+ p_record->p_evt->enabled=0;
+
+ stop_pmu_counter(slot);
+}
+
+
+static void init_pmu_event_record(struct pmu_event * p_evt, struct pmu_event_record * p_record)
+{
+ struct prof_stat *p_stat;
+ int i=0;
+
+ p_record->p_evt=p_evt;
+ p_record->last_val=p_evt->init_val;
+ p_record->base_val=p_evt->init_val;
+ p_stat=p_record->prof_stat;
+
+ for(i=0;i<MAX_PROF_POINTS;i++)
+ {
+ p_stat[i].prof_seq=i;
+ p_stat[i].max_val=0;
+ p_stat[i].min_val=-1U;
+    p_stat[i].raw_val=0xdeadbeef;
+ p_stat[i].cur_val=0;
+ p_stat[i].total_val=0;
+ p_stat[i].enter_count=0;
+ }
+}
+
+static void start_event_profile(struct pmu_event_record * p_record)
+{
+ int slot=p_record->p_evt->slot;
+ struct prof_stat *p_stat;
+ int i;
+
+ p_record->p_evt->enabled=1;
+
+ p_stat=p_record->prof_stat;
+
+ for(i=0;i<MAX_PROF_POINTS;i++)
+ {
+ p_stat[i].prof_seq=i;
+ p_stat[i].max_val=0;
+ p_stat[i].min_val=-1U;
+    p_stat[i].raw_val=0xdeadbeef;
+ p_stat[i].cur_val=0;
+ p_stat[i].total_val=0;
+ p_stat[i].enter_count=0;
+ }
+
+ write_pmu_counter(slot,p_record->p_evt->init_val);
+ start_pmu_counter(slot);
+
+}
+
+/* create event and profile */
+
+
+int setup_event_counter(int slot, int event_id)
+{
+
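+	/* slot 31 is the cycle-counter slot, which has no programmable event type */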
+ if(slot==31)
+ return 0;
+
+ if(event_id>1023)
+ return -1;
+
+ write_32bit_sysreg(PMSELR_EL0,slot);
+ write_32bit_sysreg(PMXEVTYPER_EL0,event_id);
+
+ return 0;
+}
+
+
+static struct pmu_event_record * create_pmu_event_record(char *name, int slot,
+ int event_id, uint32_t init_val, char * note)
+{
+ struct pmu_event * p_evt;
+ struct pmu_event_record * p_record;
+
+ if(setup_event_counter(slot,event_id)<0)
+ return NULL;
+
+ p_evt=malloc(sizeof(struct pmu_event));
+
+ if(p_evt==NULL)
+ return NULL;
+
+ p_evt->name=name;
+ p_evt->slot=slot;
+ p_evt->event_id=event_id;
+ p_evt->init_val=init_val;
+ p_evt->note=note;
+ p_evt->enabled=0;
+
+ p_record=malloc(sizeof(struct pmu_event_record));
+
+ if(p_record==NULL)
+ {
+ free(p_evt);
+ return NULL;
+ }
+
+ p_record->p_evt=p_evt;
+
+ init_pmu_event_record(p_evt,p_record);
+
+ return p_record;
+}
+
+
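+/* Record one profiling sample: read the counter, take the delta against
+ * either the previous sample (cal_offset) or the base value, optionally
+ * roll the reading into last_val, and fold the delta into min/max/total. */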
+static void record_event_prof(struct pmu_event_record * p_record,
+ int prof_seq, int cal_offset, int update_last)
+{
+ struct prof_stat * p_stat;
+ uint32_t evt_val;
+
+ evt_val=read_pmu_counter(p_record->p_evt->slot);
+
+ p_stat=&p_record->prof_stat[prof_seq];
+
+ p_stat->cal_offset=cal_offset;
+ p_stat->update_last=update_last;
+ p_stat->raw_val=evt_val;
+
+ if(cal_offset)
+ p_stat->cur_val=evt_val-p_record->last_val;
+ else
+ p_stat->cur_val=evt_val-p_record->base_val;
+
+ if(update_last)
+ p_record->last_val=evt_val;
+
+ p_stat->total_val+=p_stat->cur_val;
+
+ if(p_stat->cur_val>p_stat->max_val)
+ p_stat->max_val=p_stat->cur_val;
+
+ if(p_stat->cur_val<p_stat->min_val)
+ p_stat->min_val=p_stat->cur_val;
+
+ p_stat->enter_count++;
+}
+
+
+static void release_pmu_event_record(struct pmu_event_record * p_record)
+{
+ struct pmu_event * p_evt;
+
+ p_evt=p_record->p_evt;
+
+ if(p_evt->enabled)
+ stop_pmu_counter(p_evt->slot);
+
+ free(p_evt);
+ free(p_record);
+}
+
+
+
+
+/* debugging */
+
+static void dump_pmu_event(struct pmu_event * p_evt)
+{
+
+ printf("event[%s/0x%x]: slot [%d] init_val[0x%x] enabled[%d]",
+ p_evt->name,p_evt->event_id,p_evt->slot,p_evt->init_val,
+ p_evt->enabled);
+
+ if(p_evt->note)
+ printf(" note[%s]\n",p_evt->note);
+ else
+ printf("\n");
+}
+
+static void dump_pmu_event_record(struct pmu_event_record * p_record)
+{
+ int i;
+ struct prof_stat * p_stat;
+ uint64_t total_avg_val=0;
+ int count=0;
+ uint32_t avg;
+
+ printf("------------------------------------------------------------------------\n");
+
+ dump_pmu_event(p_record->p_evt);
+
+ p_stat=&p_record->prof_stat[0];
+
+ for(i=0;i<MAX_PROF_POINTS;i++)
+ {
+ if(p_stat[i].enter_count==0)
+ continue;
+
+ avg=(uint32_t)(p_stat[i].total_val/p_stat[i].enter_count);
+
+ printf("stat [%d]: max/min/avg [0x%x/0x%x/0x%x] total [0x%lx] count[%u]\n",
+ i,p_stat[i].max_val,p_stat[i].min_val,
+ avg,
+ p_stat[i].total_val,p_stat[i].enter_count);
+ printf(" raw_val[0x%x] cal_offset[%d] update_last[%d]\n",
+ p_stat[i].raw_val,p_stat[i].cal_offset,p_stat[i].update_last);
+
+ count++;
+
+ total_avg_val+=avg;
+
+
+ }
+
+ printf("total [%d] points, the sum of average number is: [0x%lx]\n\n",count,total_avg_val);
+}
+
+/* output interface */
+
+void init_pmu_registers(void)
+{
+	/* enable the PMU via PMCR_EL0.E, then read the number of implemented counters from PMCR_EL0.N */
+ write_32bit_sysreg(PMCR_EL0,0x1);
+ max_counter_slot=(read_32bit_sysreg(PMCR_EL0) >> ARMV8_PMCR_N_SHIFT)&ARMV8_PMCR_N_MASK;
+}
+
+
+#define dump_32bit_sysreg(reg) \
+ printf(__stringify(reg) " is [0x%08x]\n",read_32bit_sysreg(reg))
+
+#define dump_64bit_sysreg(reg) \
+	printf(__stringify(reg) " is [0x%016llx]\n",(unsigned long long)read_64bit_sysreg(reg))
+
+
+void dump_pmu_registers(void)
+{
+ dump_32bit_sysreg(PMCEID0_EL0);
+ dump_32bit_sysreg(PMCEID1_EL0);
+ dump_32bit_sysreg(PMOVSSET_EL0);
+ dump_32bit_sysreg(PMCR_EL0);
+ dump_32bit_sysreg(PMUSERENR_EL0);
+ dump_32bit_sysreg(PMCNTENSET_EL0);
+}
+
+struct pmu_event_record * get_pmu_event_record(int slot)
+{
+ return g_rec_ptr[slot];
+}
+
+int create_pmu_event(char *name,int event_id,
+ uint32_t init_val, char * note)
+{
+ int i;
+
+ struct pmu_event_record * p_record;
+
+ for(i=0;i<max_counter_slot;i++)
+ {
+ if(g_rec_ptr[i]==NULL)
+ break;
+ }
+
+ if(i==max_counter_slot)
+ return -1;
+
+ p_record=create_pmu_event_record(name,i,event_id,init_val,note);
+
+ if(p_record==NULL)
+ return -1;
+
+ g_rec_ptr[i]=p_record;
+
+ return i;
+}
+
+void release_pmu_event(int slot)
+{
+ struct pmu_event_record * p_record;
+
+ p_record=g_rec_ptr[slot];
+
+ if(p_record)
+ release_pmu_event_record(p_record);
+
+ g_rec_ptr[slot]=NULL;
+}
+
+void start_pmu_event(int slot)
+{
+ struct pmu_event_record * p_record;
+
+ p_record=g_rec_ptr[slot];
+
+ start_event_profile(p_record);
+}
+
+void stop_pmu_event(int slot)
+{
+ struct pmu_event_record * p_record;
+
+ p_record=g_rec_ptr[slot];
+
+ stop_event_profile(p_record);
+}
+
+void record_pmu_event(int slot, int seq, int cal_offset, int update_last)
+{
+ struct pmu_event_record * p_record;
+
+ p_record=g_rec_ptr[slot];
+
+ record_event_prof(p_record,seq,cal_offset,update_last);
+}
+
+void dump_pmu_event_stat(int slot)
+{
+ struct pmu_event_record * p_record;
+
+ p_record=g_rec_ptr[slot];
+
+ dump_pmu_event_record(p_record);
+}
+
+
+uint32_t get_pmu_stat_avg(int slot)
+{
+ struct pmu_event_record * p_record;
+ struct prof_stat * p_stat;
+ uint32_t total_avg=0;
+ uint32_t avg;
+ int i;
+
+ p_record=g_rec_ptr[slot];
+
+ for(i=0;i<MAX_PROF_POINTS;i++)
+ {
+ p_stat=&p_record->prof_stat[i];
+
+ if(p_stat->enter_count==0)
+ continue;
+ avg=p_stat->total_val/p_stat->enter_count;
+ total_avg+=avg;
+ }
+
+ return total_avg;
+}
+
+void set_pmu_event_base(int slot)
+{
+ struct pmu_event_record * p_record;
+
+ uint32_t val;
+
+ p_record=g_rec_ptr[slot];
+
+ val=read_pmu_counter(slot);
+
+ p_record->last_val=val;
+ p_record->base_val=val;
+
+}
diff --git a/unit_tests/pmu.h b/unit_tests/pmu.h
new file mode 100644
index 00000000..9c252066
--- /dev/null
+++ b/unit_tests/pmu.h
@@ -0,0 +1,130 @@
+#ifndef AARCH64_PMU_H
+#define AARCH64_PMU_H
+
+#include <stdint.h>
+
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+#define read_32bit_sysreg(reg) \
+ ({\
+ uint32_t val;\
+ __asm__ __volatile__ (\
+ "mrs %0," __stringify(reg):"=r"(val));\
+ val;\
+ })
+
+
+#define read_64bit_sysreg(reg) \
+ ({\
+ uint64_t val;\
+ __asm__ __volatile__ (\
+ "mrs %0," __stringify(reg):"=r"(val));\
+ val;\
+ })
+
+#define write_32bit_sysreg(reg,val) \
+ ({\
+ __asm__ __volatile__ (\
+ "msr " __stringify(reg) " ,%0"::"r"(val));\
+ })
+
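+/* msr always transfers a full X register, so the 32-bit writer works
+ * unchanged for 64-bit values */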
+#define write_64bit_sysreg(reg,val) write_32bit_sysreg(reg,val)
+
+#define MAX_PROF_POINTS 16
+
+struct pmu_event
+{
+ int slot;
+ int event_id;
+ char * name;
+ uint32_t init_val;
+ int enabled;
+ char * note;
+};
+
+struct prof_stat
+{
+ int prof_seq;
+ uint32_t max_val;
+ uint32_t min_val;
+ uint32_t cur_val;
+ uint32_t raw_val;
+ uint64_t total_val;
+ uint32_t enter_count;
+ int cal_offset;
+ int update_last;
+};
+
+
+struct pmu_event_record
+{
+ struct pmu_event* p_evt;
+ uint32_t last_val;
+ uint32_t base_val;
+ struct prof_stat prof_stat[MAX_PROF_POINTS];
+};
+
+/* all functions in the group must be called on the same CPU */
+
+extern void init_pmu_registers(void);
+extern void dump_pmu_registers(void);
+
+/* create one event with event_id; returns the slot number on success */
+extern int create_pmu_event(char *name,int event_id,
+ uint32_t init_val, char * note);
+
+extern void release_pmu_event(int slot);
+
+extern void start_pmu_event(int slot);
+
+extern void stop_pmu_event(int slot);
+
+extern void set_pmu_event_base(int slot);
+
+extern void record_pmu_event(int slot, int seq, int cal_offset, int update_last);
+
+extern void dump_pmu_event_stat(int slot);
+
+extern struct pmu_event_record * get_pmu_event_record(int slot);
+
+extern uint32_t get_pmu_stat_avg(int slot); /* sums the per-phase averages */
+
+/* register-level interface */
+
+extern int setup_event_counter(int slot, int event_id);
+
+static inline void start_pmu_counter(int slot)
+{
+ uint32_t mask=1<<slot;
+
+ write_32bit_sysreg(PMCNTENSET_EL0,mask);
+}
+
+static inline void stop_pmu_counter(int slot)
+{
+ uint32_t mask=1<<slot;
+
+ write_32bit_sysreg(PMCNTENCLR_EL0,mask);
+}
+
+
+static inline void write_pmu_counter(int slot,uint32_t val)
+{
+ write_32bit_sysreg(PMSELR_EL0,slot);
+
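+	/* slot 31 is treated as the 64-bit cycle counter here; ordinary event
+	   counters are 32-bit */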
+ if(slot<31)
+ write_32bit_sysreg(PMXEVCNTR_EL0, val);
+ else
+ write_64bit_sysreg(PMXEVCNTR_EL0,val);
+
+}
+
+static inline uint32_t read_pmu_counter(int slot)
+{
+ write_32bit_sysreg(PMSELR_EL0,slot);
+ return read_32bit_sysreg(PMXEVCNTR_EL0);
+}
+
+#endif
diff --git a/unit_tests/prof_convolution_layer.cpp b/unit_tests/prof_convolution_layer.cpp
new file mode 100644
index 00000000..01f4fcb7
--- /dev/null
+++ b/unit_tests/prof_convolution_layer.cpp
@@ -0,0 +1,302 @@
+#include <vector>
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/conv_layer.hpp"
+
+#include <glog/logging.h>
+
+extern "C" {
+#include "testbed.h"
+}
+
+
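+// Repurpose the gtest-style test bodies as plain member functions: TYPED_TEST
+// is redefined to emit a method definition and the EXPECT_* macros become
+// no-ops, so the same code runs under the profiling testbed without gtest.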
+#define TYPED_TEST(a,b) template <typename TypeParam> void a <TypeParam>:: b (void)
+#define EXPECT_NEAR(a,b,c) {}
+#define EXPECT_EQ(a,b) {}
+
+namespace caffe {
+
+template <typename TypeParam>
+struct CPUDevice {
+ typedef TypeParam Dtype;
+ static const Caffe::Brew device = Caffe::CPU;
+};
+
+
+template <typename TypeParam>
+class ConvolutionLayerTest {
+ typedef typename TypeParam::Dtype Dtype;
+
+public:
+
+ void TestSimpleConvolution(void);
+
+ void TestDilatedConvolution(void);
+
+ void Test0DConvolution(void);
+
+ void TestSimple3DConvolution(void);
+
+ void TestDilated3DConvolution(void);
+
+ void Test1x1Convolution(void);
+
+ void TestSimpleConvolutionGroup(void);
+
+ void TestNDAgainst2D(void);
+
+ void RunConvolution(void);
+
+ ConvolutionLayerTest()
+ : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)),
+ blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)),
+ blob_top_(new Blob<Dtype>()),
+ blob_top_2_(new Blob<Dtype>()) {}
+ virtual void SetUp() {
+ // fill the values
+ FillerParameter filler_param;
+ filler_param.set_value(1.);
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ filler.Fill(this->blob_bottom_2_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+
+ virtual ~ConvolutionLayerTest() {
+ delete blob_bottom_;
+ delete blob_bottom_2_;
+ delete blob_top_;
+ delete blob_top_2_;
+ }
+
+ virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) {
+ this->ref_blob_top_.reset(new Blob<Dtype>());
+ this->ref_blob_top_->ReshapeLike(*top);
+ return this->ref_blob_top_.get();
+ }
+
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_bottom_2_;
+ Blob<Dtype>* const blob_top_;
+ Blob<Dtype>* const blob_top_2_;
+ shared_ptr<Blob<Dtype> > ref_blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+ shared_ptr<Layer<Dtype> > layer;
+};
+
+TYPED_TEST(ConvolutionLayerTest, RunConvolution) {
+
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+
+TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+ layer=shared_ptr<Layer<Dtype> > (new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+
+TYPED_TEST(ConvolutionLayerTest, TestDilatedConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ vector<int> bottom_shape;
+ bottom_shape.push_back(2);
+ bottom_shape.push_back(3);
+ bottom_shape.push_back(8);
+ bottom_shape.push_back(7);
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_dilation(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+ layer=shared_ptr<Layer<Dtype> > (new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ const int kNumOutput = 3;
+ convolution_param->set_num_output(kNumOutput);
+ convolution_param->set_axis(3);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("gaussian");
+ layer=shared_ptr<Layer<Dtype> > (
+ new ConvolutionLayer<Dtype>(layer_param));
+ vector<int> top_shape = this->blob_bottom_->shape();
+ top_shape[3] = kNumOutput;
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(top_shape, this->blob_top_->shape());
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ vector<int> bottom_shape(5);
+ bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+ bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+ bottom_shape[2] = 5;
+ bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
+ bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ filler.Fill(this->blob_bottom_vec_[i]);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("gaussian");
+ layer=shared_ptr<Layer<Dtype> > (
+ new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestDilated3DConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ vector<int> bottom_shape(5);
+ bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+ bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+ bottom_shape[2] = 6;
+ bottom_shape[3] = 7;
+ bottom_shape[4] = 8;
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ filler.Fill(this->blob_bottom_vec_[i]);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_dilation(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("gaussian");
+ layer=shared_ptr<Layer<Dtype> > (
+ new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(1);
+ convolution_param->add_stride(1);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+ layer=shared_ptr<Layer<Dtype> > (
+ new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(3);
+ convolution_param->set_group(3);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+ layer=shared_ptr<Layer<Dtype> > (
+ new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+} // namespace caffe
+
+using namespace caffe;
+ConvolutionLayerTest<CPUDevice<float> > * g_convptr;
+
+void single_forward(void * dummy )
+{
+ g_convptr->RunConvolution();
+}
+
+void forward_convolution(void)
+{
+ run_test(16,0,single_forward,NULL);
+}
+
+#define RUN_FUNC(test_case) test_ ## test_case ()
+
+#define DEF_TEST_FUNC(test_case) \
+void test_## test_case (void)\
+{\
+ std::cout<<__FUNCTION__<<" start ..."<<std::endl;\
+ g_convptr=new ConvolutionLayerTest<CPUDevice<float> >;\
+ g_convptr->SetUp();\
+ g_convptr->Test ## test_case ();\
+ forward_convolution();\
+ delete g_convptr;\
+ std::cout<<__FUNCTION__<<" DONE"<<std::endl;\
+}
+
+DEF_TEST_FUNC(SimpleConvolution)
+DEF_TEST_FUNC(DilatedConvolution)
+DEF_TEST_FUNC(0DConvolution)
+DEF_TEST_FUNC(Simple3DConvolution)
+DEF_TEST_FUNC(Dilated3DConvolution)
+DEF_TEST_FUNC(1x1Convolution)
+DEF_TEST_FUNC(SimpleConvolutionGroup)
+
+
+int main(int argc, char * argv[])
+{
+ caffe::GlobalInit(&argc, &argv);
+
+ init_testbed();
+
+ RUN_FUNC(SimpleConvolution);
+ RUN_FUNC(DilatedConvolution);
+ RUN_FUNC(0DConvolution);
+ RUN_FUNC(Simple3DConvolution);
+ RUN_FUNC(Dilated3DConvolution);
+ RUN_FUNC(1x1Convolution);
+ RUN_FUNC(SimpleConvolutionGroup);
+
+ release_testbed();
+ return 0;
+}
diff --git a/unit_tests/sgemm.cpp b/unit_tests/sgemm.cpp
new file mode 100644
index 00000000..11a80a49
--- /dev/null
+++ b/unit_tests/sgemm.cpp
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+
+#include <cblas.h>
+
+extern "C" {
+#include "testbed.h"
+}
+
+float * A;
+float * B;
+float * C;
+int M;
+int N;
+int K;
+
+
+void init_matrix(float * m, int size)
+{
+ int i;
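+	/* only every other element is written; the values are irrelevant for a
+	   timing benchmark, but note the rest of the malloc'd buffer stays
+	   uninitialized */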
+ for(i=0;i<size;i+=2)
+ m[i]=i;
+}
+
+void init_data(void)
+{
+ int sizeA;
+ int sizeB;
+ int sizeC;
+
+ sizeA=M*K;
+ sizeB=K*N;
+ sizeC=M*N;
+
+ A=(float *)malloc(sizeA*sizeof(float));
+ B=(float *)malloc(sizeB*sizeof(float));
+ C=(float *)malloc(sizeC*sizeof(float));
+
+ init_matrix(A,sizeA);
+ init_matrix(B,sizeB);
+ init_matrix(C,sizeC);
+}
+
+
+
+void run_sgemm(void * dummy)
+{
+ int i;
+ for(i=0;i<1;i++)
+ {
+ cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,M,N,K,
+ 1.0,A,K,B,N,0,C,N);
+ }
+
+}
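+/* The cblas_sgemm call above computes C = alpha*A*B + beta*C with alpha=1.0
+ * and beta=0, i.e. it overwrites the M x N matrix C with the product of the
+ * M x K matrix A and the K x N matrix B. With CblasRowMajor the leading
+ * dimensions are the row strides: lda=K, ldb=N, ldc=N. */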
+
+int main(int argc, char * argv[])
+{
+
+ M=27;
+ K=9;
+ N=37632;
+
+
+ init_data();
+ init_testbed();
+
+ run_test(16,1,run_sgemm,NULL);
+
+ release_testbed();
+
+ return 0;
+}
diff --git a/unit_tests/test.cpp b/unit_tests/test.cpp
new file mode 100644
index 00000000..80adcc92
--- /dev/null
+++ b/unit_tests/test.cpp
@@ -0,0 +1,37 @@
+#include "gtest/gtest.h"
+
+
+template <typename TypeParam>
+class foo : public ::testing::Test {
+
+public:
+  foo() {}
+  ~foo() {}
+
+ TypeParam data;
+};
+
+
+typedef ::testing::Types<int,float > TestDtype;
+
+TYPED_TEST_CASE(foo,TestDtype);
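+// gtest instantiates the fixture once per type in TestDtype, so the
+// TYPED_TEST below runs twice: once with TypeParam=int and once with
+// TypeParam=float.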
+
+TYPED_TEST(foo,test1)
+{
+
+ TypeParam a=10;
+
+ this->data=10;
+
+ EXPECT_EQ(this->data,a);
+
+}
+
+
+int main(int argc, char * argv[])
+{
+ ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/test_caffe_main.cpp b/unit_tests/test_caffe_main.cpp
new file mode 100644
index 00000000..ee0dcd4a
--- /dev/null
+++ b/unit_tests/test_caffe_main.cpp
@@ -0,0 +1,34 @@
+#include "caffe/caffe.hpp"
+#include "caffe/test/test_caffe_main.hpp"
+
+extern "C" {
+#include "testbed.h"
+}
+
+class testbed_env: public ::testing::Environment {
+
+ public:
+  testbed_env() {}
+  ~testbed_env() {}
+
+  void SetUp(void)
+  {
+    std::cout<<"setting up testbed resource"<<std::endl;
+    init_testbed();
+  }
+
+  void TearDown(void)
+  {
+    std::cout<<"release testbed resource"<<std::endl;
+    release_testbed();
+  }
+
+};
+
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ caffe::GlobalInit(&argc, &argv);
+
+ ::testing::AddGlobalTestEnvironment(new testbed_env);
+ // invoke the test.
+ return RUN_ALL_TESTS();
+}
diff --git a/unit_tests/test_common.cpp b/unit_tests/test_common.cpp
new file mode 100644
index 00000000..58ae5c60
--- /dev/null
+++ b/unit_tests/test_common.cpp
@@ -0,0 +1,64 @@
+#include "gtest/gtest.h"
+
+#include "caffe/common.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+class CommonTest : public ::testing::Test {};
+
+#ifndef CPU_ONLY // GPU Caffe singleton test.
+
+TEST_F(CommonTest, TestCublasHandlerGPU) {
+ int cuda_device_id;
+ CUDA_CHECK(cudaGetDevice(&cuda_device_id));
+ EXPECT_TRUE(Caffe::cublas_handle());
+}
+
+#endif
+
+TEST_F(CommonTest, TestBrewMode) {
+ Caffe::set_mode(Caffe::CPU);
+ EXPECT_EQ(Caffe::mode(), Caffe::CPU);
+ Caffe::set_mode(Caffe::GPU);
+ EXPECT_EQ(Caffe::mode(), Caffe::GPU);
+}
+
+TEST_F(CommonTest, TestRandSeedCPU) {
+ SyncedMemory data_a(10 * sizeof(int));
+ SyncedMemory data_b(10 * sizeof(int));
+ Caffe::set_random_seed(1701);
+ caffe_rng_bernoulli(10, 0.5, static_cast<int*>(data_a.mutable_cpu_data()));
+
+ Caffe::set_random_seed(1701);
+ caffe_rng_bernoulli(10, 0.5, static_cast<int*>(data_b.mutable_cpu_data()));
+
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_EQ(static_cast<const int*>(data_a.cpu_data())[i],
+ static_cast<const int*>(data_b.cpu_data())[i]);
+ }
+}
+
+#ifndef CPU_ONLY // GPU Caffe singleton test.
+
+TEST_F(CommonTest, TestRandSeedGPU) {
+ SyncedMemory data_a(10 * sizeof(unsigned int));
+ SyncedMemory data_b(10 * sizeof(unsigned int));
+ Caffe::set_random_seed(1701);
+ CURAND_CHECK(curandGenerate(Caffe::curand_generator(),
+ static_cast<unsigned int*>(data_a.mutable_gpu_data()), 10));
+ Caffe::set_random_seed(1701);
+ CURAND_CHECK(curandGenerate(Caffe::curand_generator(),
+ static_cast<unsigned int*>(data_b.mutable_gpu_data()), 10));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_EQ(((const unsigned int*)(data_a.cpu_data()))[i],
+ ((const unsigned int*)(data_b.cpu_data()))[i]);
+ }
+}
+
+#endif
+
+} // namespace caffe
diff --git a/unit_tests/test_convolution_layer.cpp b/unit_tests/test_convolution_layer.cpp
new file mode 100644
index 00000000..b2db63ee
--- /dev/null
+++ b/unit_tests/test_convolution_layer.cpp
@@ -0,0 +1,888 @@
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/conv_layer.hpp"
+
+#ifdef USE_CUDNN
+#include "caffe/layers/cudnn_conv_layer.hpp"
+#endif
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+static void dump_blob(const Blob<Dtype> * blob, const char * outfile)
+{
+ std::ofstream os;
+ os.open(outfile);
+
+ for(int i=0;i<blob->LegacyShape(0);i++)
+ {
+ os<<"batch: "<<i<<std::endl;
+
+ for(int j=0;j<blob->LegacyShape(1);j++)
+ {
+ os<<"channel: "<<j<<std::endl;
+
+ for(int k=0;k<blob->LegacyShape(2);k++)
+ {
+ for(int l=0;l<blob->LegacyShape(3);l++)
+ {
+ Dtype data=blob->data_at(i,j,k,l);
+ os<<" "<<data;
+ }
+ os<<std::endl;
+ }
+ os<<std::endl;
+ }
+
+ }
+
+ os.close();
+
+}
+
+
+template <typename Dtype>
+static void fill_blob_data(Blob<Dtype >* bottom, int fixed, float val)
+{
+ for(int i=0;i<bottom->num();i++)
+ for(int j=0;j<bottom->channels();j++)
+ for(int l=0;l<bottom->height();l++)
+ for(int k=0;k<bottom->width();k++)
+ {
+ int offset;
+ Dtype * ptr;
+
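+        // Row-major NCHW offset: ((i*C + j)*H + l)*W + k, i.e. the same
+        // index Blob<Dtype>::offset(i, j, l, k) would return.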
+ offset=i*bottom->channels()*bottom->height()*bottom->width()+
+ j*bottom->height()*bottom->width()+
+ l*bottom->width()+k;
+
+ ptr=bottom->mutable_cpu_data();
+
+ if(fixed)
+ ptr[offset]=val;
+ else
+ ptr[offset]=offset;
+
+ }
+
+
+}
+
+
+// Reference convolution for checking results:
+// accumulate through explicit loops over input, output, and filters.
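+// Within each group, ignoring the optional depth axis:
+//   out(n, o, y, x) = bias(o) + sum over k, p, q of
+//     in(n, k, y*stride_h - pad_h + p*dilation_h,
+//              x*stride_w - pad_w + q*dilation_w) * weight(o, k, p, q)
+// where out-of-range input positions contribute zero.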
+template <typename Dtype>
+void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
+ const vector<shared_ptr<Blob<Dtype> > >& weights,
+ Blob<Dtype>* out) {
+ const bool has_depth = (out->num_axes() == 5);
+ if (!has_depth) { CHECK_EQ(4, out->num_axes()); }
+ // Kernel size, stride, and pad
+ int kernel_h, kernel_w;
+ if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) {
+ kernel_h = conv_param->kernel_h();
+ kernel_w = conv_param->kernel_w();
+ } else {
+ kernel_h = kernel_w = conv_param->kernel_size(0);
+ }
+ int pad_h, pad_w;
+ if (conv_param->has_pad_h() || conv_param->has_pad_w()) {
+ pad_h = conv_param->pad_h();
+ pad_w = conv_param->pad_w();
+ } else {
+ pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0;
+ }
+ int stride_h, stride_w;
+ if (conv_param->has_stride_h() || conv_param->has_stride_w()) {
+ stride_h = conv_param->stride_h();
+ stride_w = conv_param->stride_w();
+ } else {
+ stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1;
+ }
+ int dilation_h, dilation_w;
+ dilation_h = dilation_w = conv_param->dilation_size() ?
+ conv_param->dilation(0) : 1;
+ int kernel_d, pad_d, stride_d, dilation_d;
+ if (has_depth) {
+ kernel_d = kernel_h;
+ stride_d = stride_h;
+ pad_d = pad_h;
+ dilation_d = dilation_h;
+ } else {
+ kernel_d = stride_d = dilation_d = 1;
+ pad_d = 0;
+ }
+ // Groups
+ int groups = conv_param->group();
+ int o_g = out->shape(1) / groups;
+ int k_g = in->shape(1) / groups;
+ int o_head, k_head;
+ // Convolution
+ vector<int> weight_offset(4 + has_depth);
+ vector<int> in_offset(4 + has_depth);
+ vector<int> out_offset(4 + has_depth);
+ Dtype* out_data = out->mutable_cpu_data();
+ for (int n = 0; n < out->shape(0); n++) {
+ for (int g = 0; g < groups; g++) {
+ o_head = o_g * g;
+ k_head = k_g * g;
+ for (int o = 0; o < o_g; o++) {
+ for (int k = 0; k < k_g; k++) {
+ for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+ for (int y = 0; y < out->shape(2 + has_depth); y++) {
+ for (int x = 0; x < out->shape(3 + has_depth); x++) {
+ for (int r = 0; r < kernel_d; r++) {
+ for (int p = 0; p < kernel_h; p++) {
+ for (int q = 0; q < kernel_w; q++) {
+ int in_z = z * stride_d - pad_d + r * dilation_d;
+ int in_y = y * stride_h - pad_h + p * dilation_h;
+ int in_x = x * stride_w - pad_w + q * dilation_w;
+ if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1)
+ && in_y >= 0 && in_y < in->shape(2 + has_depth)
+ && in_x >= 0 && in_x < in->shape(3 + has_depth)) {
+ weight_offset[0] = o + o_head;
+ weight_offset[1] = k;
+ if (has_depth) { weight_offset[2] = r; }
+ weight_offset[2 + has_depth] = p;
+ weight_offset[3 + has_depth] = q;
+ in_offset[0] = n;
+ in_offset[1] = k + k_head;
+ if (has_depth) { in_offset[2] = in_z; }
+ in_offset[2 + has_depth] = in_y;
+ in_offset[3 + has_depth] = in_x;
+ out_offset[0] = n;
+ out_offset[1] = o + o_head;
+ if (has_depth) { out_offset[2] = z; }
+ out_offset[2 + has_depth] = y;
+ out_offset[3 + has_depth] = x;
+ out_data[out->offset(out_offset)] +=
+ in->data_at(in_offset)
+ * weights[0]->data_at(weight_offset);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ // Bias
+ if (conv_param->bias_term()) {
+ const Dtype* bias_data = weights[1]->cpu_data();
+ for (int n = 0; n < out->shape(0); n++) {
+ for (int o = 0; o < out->shape(1); o++) {
+ for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+ for (int y = 0; y < out->shape(2 + has_depth); y++) {
+ for (int x = 0; x < out->shape(3 + has_depth); x++) {
+ out_offset[0] = n;
+ out_offset[1] = o;
+ if (has_depth) { out_offset[2] = z; }
+ out_offset[2 + has_depth] = y;
+ out_offset[3 + has_depth] = x;
+ out_data[out->offset(out_offset)] += bias_data[o];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template void caffe_conv(const Blob<float>* in,
+ ConvolutionParameter* conv_param,
+ const vector<shared_ptr<Blob<float> > >& weights,
+ Blob<float>* out);
+template void caffe_conv(const Blob<double>* in,
+ ConvolutionParameter* conv_param,
+ const vector<shared_ptr<Blob<double> > >& weights,
+ Blob<double>* out);
+
+template <typename TypeParam>
+class ConvolutionLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+ ConvolutionLayerTest()
+ : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)),
+ blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)),
+ blob_top_(new Blob<Dtype>()),
+ blob_top_2_(new Blob<Dtype>()) {}
+ virtual void SetUp() {
+ // fill the values
+ FillerParameter filler_param;
+ filler_param.set_value(1.);
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ filler.Fill(this->blob_bottom_2_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+
+ virtual ~ConvolutionLayerTest() {
+ delete blob_bottom_;
+ delete blob_bottom_2_;
+ delete blob_top_;
+ delete blob_top_2_;
+ }
+
+ virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) {
+ this->ref_blob_top_.reset(new Blob<Dtype>());
+ this->ref_blob_top_->ReshapeLike(*top);
+ return this->ref_blob_top_.get();
+ }
+
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_bottom_2_;
+ Blob<Dtype>* const blob_top_;
+ Blob<Dtype>* const blob_top_2_;
+ shared_ptr<Blob<Dtype> > ref_blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+
+#define TestDtypesAndDevices float_only
+TYPED_TEST_CASE(ConvolutionLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(ConvolutionLayerTest, TestSetup) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(4);
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->channels(), 4);
+ EXPECT_EQ(this->blob_top_->height(), 2);
+ EXPECT_EQ(this->blob_top_->width(), 1);
+ EXPECT_EQ(this->blob_top_2_->num(), 2);
+ EXPECT_EQ(this->blob_top_2_->channels(), 4);
+ EXPECT_EQ(this->blob_top_2_->height(), 2);
+ EXPECT_EQ(this->blob_top_2_->width(), 1);
+ // setting group should not change the shape
+ convolution_param->set_num_output(3);
+ convolution_param->set_group(3);
+ layer.reset(new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->channels(), 3);
+ EXPECT_EQ(this->blob_top_->height(), 2);
+ EXPECT_EQ(this->blob_top_->width(), 1);
+ EXPECT_EQ(this->blob_top_2_->num(), 2);
+ EXPECT_EQ(this->blob_top_2_->channels(), 3);
+ EXPECT_EQ(this->blob_top_2_->height(), 2);
+ EXPECT_EQ(this->blob_top_2_->width(), 1);
+}
+
+
+TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+
+
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(3);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+
+ vector<int> bottom_shape;
+ bottom_shape.push_back(2);
+ bottom_shape.push_back(3);
+ bottom_shape.push_back(5);
+ bottom_shape.push_back(5);
+
+ this->blob_bottom_->Reshape(bottom_shape);
+ this->blob_bottom_2_->Reshape(bottom_shape);
+
+ fill_blob_data(this->blob_bottom_,0,1);
+ fill_blob_data(this->blob_bottom_2_,1,1);
+
+ layer_param.set_type("Convolution");
+
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ //fill_blob_data(layer->blobs()[0].get(),1,1);
+
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+#ifdef LAYER_PERF_STAT
+ perf_stat * p_time_stat;
+ p_time_stat=layer->get_time_stat();
+ std::cout<<"start: "<<p_time_stat->start;
+ std::cout<<" end: "<<p_time_stat->end;
+ std::cout<<" used: "<<p_time_stat->used;
+ std::cout<<" total: "<<p_time_stat->total;
+ std::cout<<" count: "<<p_time_stat->count<<std::endl;
+#endif
+
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+
+
+// dump_blob(this->blob_bottom_,"bottom.data");
+// dump_blob(this->blob_top_,"top.data");
+// dump_blob(this->ref_blob_top_.get(),"reftop.data");
+// dump_blob(layer->blobs()[0].get(),"weight.data");
+// dump_blob(layer->blobs()[1].get(),"bias.data");
+#if 1
+ caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_2_));
+ top_data = this->blob_top_2_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+#endif
+}
+
+#if 0
+
+TYPED_TEST(ConvolutionLayerTest, TestDilatedConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ vector<int> bottom_shape;
+ bottom_shape.push_back(2);
+ bottom_shape.push_back(3);
+ bottom_shape.push_back(8);
+ bottom_shape.push_back(7);
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_dilation(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+ caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_2_));
+ top_data = this->blob_top_2_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+}
+
+TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ const int kNumOutput = 3;
+ convolution_param->set_num_output(kNumOutput);
+ convolution_param->set_axis(3);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("gaussian");
+
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ vector<int> top_shape = this->blob_bottom_->shape();
+ top_shape[3] = kNumOutput;
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(top_shape, this->blob_top_->shape());
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ vector<int> weight_offset(2);
+ const Blob<Dtype>* weight = layer->blobs()[0].get();
+ const Blob<Dtype>* bias = layer->blobs()[1].get();
+ const int num = this->blob_top_->count(3);
+ const int dim = this->blob_top_->shape(3);
+ const int bottom_dim = this->blob_bottom_->shape(3);
+ for (int n = 0; n < num; ++n) {
+ for (int d = 0; d < dim; ++d) {
+ weight_offset[0] = d;
+ Dtype value = bias->cpu_data()[d];
+ for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) {
+ weight_offset[1] = bottom_d;
+ value += weight->data_at(weight_offset) *
+ this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d];
+ }
+ EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4);
+ }
+ }
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ vector<int> bottom_shape(5);
+ bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+ bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+ bottom_shape[2] = 5;
+ bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
+ bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ filler.Fill(this->blob_bottom_vec_[i]);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("gaussian");
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+ caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_2_));
+ top_data = this->blob_top_2_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestDilated3DConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+ vector<int> bottom_shape(5);
+ bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+ bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+ bottom_shape[2] = 6;
+ bottom_shape[3] = 7;
+ bottom_shape[4] = 8;
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ filler.Fill(this->blob_bottom_vec_[i]);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_dilation(2);
+ convolution_param->set_num_output(4);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("gaussian");
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+ caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_2_));
+ top_data = this->blob_top_2_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+}
+
+#endif
+
+TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+#if 0
+ convolution_param->add_kernel_size(1);
+ convolution_param->set_num_output(2);
+
+ vector<int> bottom_shape;
+ bottom_shape.push_back(1);
+ bottom_shape.push_back(32);
+ bottom_shape.push_back(133);
+ bottom_shape.push_back(98);
+
+ this->blob_bottom_vec_[0]->Reshape(bottom_shape);
+
+#else
+ convolution_param->add_kernel_size(1);
+ convolution_param->add_stride(1);
+ convolution_param->set_num_output(4);
+#endif
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(1);
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ fill_blob_data(this->blob_bottom_,1,1);
+ fill_blob_data(layer->blobs()[0].get(),1,1);
+
+
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ fill_blob_data(this->blob_bottom_,1,3);
+
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ // std::cout<<i<<": "<< top_data[i]<<" "<<ref_top_data[i]<<std::endl;
+ }
+}
+
+
+#if 0
+TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(3);
+ convolution_param->set_group(3);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) {
+  // Test separable convolution by computing the Sobel operator
+  // as a single 3 x 3 filter, then comparing the result against
+  // the composition of a 3 x 1 and a 1 x 3 rectangular filter.
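+  // Concretely, the G_x kernel factors into a column and a row filter:
+  //   [ -1 0 1 ]     [ 1 ]
+  //   [ -2 0 2 ]  =  [ 2 ] * [ -1 0 1 ]
+  //   [ -1 0 1 ]     [ 1 ]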
+ typedef typename TypeParam::Dtype Dtype;
+ // Fill bottoms with identical Gaussian noise.
+ shared_ptr<GaussianFiller<Dtype> > filler;
+ FillerParameter filler_param;
+ filler_param.set_value(1.);
+ filler.reset(new GaussianFiller<Dtype>(filler_param));
+ filler->Fill(this->blob_bottom_);
+ this->blob_bottom_2_->CopyFrom(*this->blob_bottom_);
+ // Compute Sobel G_x operator as 3 x 3 convolution.
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(1);
+ convolution_param->set_bias_term(false);
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+
+ layer->blobs().resize(1);
+ layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 3));
+ Dtype* weights = layer->blobs()[0]->mutable_cpu_data();
+ for (int c = 0; c < 3; ++c) {
+ int i = c * 9; // 3 x 3 filter
+ weights[i + 0] = -1;
+ weights[i + 1] = 0;
+ weights[i + 2] = 1;
+ weights[i + 3] = -2;
+ weights[i + 4] = 0;
+ weights[i + 5] = 2;
+ weights[i + 6] = -1;
+ weights[i + 7] = 0;
+ weights[i + 8] = 1;
+ }
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions.
+ // (1) the [1 2 1] column filter
+ vector<Blob<Dtype>*> sep_blob_bottom_vec;
+ vector<Blob<Dtype>*> sep_blob_top_vec;
+ shared_ptr<Blob<Dtype> > blob_sep(new Blob<Dtype>());
+ sep_blob_bottom_vec.push_back(this->blob_bottom_2_);
+ sep_blob_top_vec.push_back(this->blob_top_2_);
+ convolution_param->clear_kernel_size();
+ convolution_param->clear_stride();
+ convolution_param->set_kernel_h(3);
+ convolution_param->set_kernel_w(1);
+ convolution_param->set_stride_h(2);
+ convolution_param->set_stride_w(1);
+ convolution_param->set_num_output(1);
+ convolution_param->set_bias_term(false);
+ layer.reset(new ConvolutionLayer<Dtype>(layer_param));
+ layer->blobs().resize(1);
+ layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 1));
+ Dtype* weights_1 = layer->blobs()[0]->mutable_cpu_data();
+ for (int c = 0; c < 3; ++c) {
+ int i = c * 3; // 3 x 1 filter
+ weights_1[i + 0] = 1;
+ weights_1[i + 1] = 2;
+ weights_1[i + 2] = 1;
+ }
+ layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
+ layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
+ // (2) the [-1 0 1] row filter
+ blob_sep->CopyFrom(*this->blob_top_2_, false, true);
+ sep_blob_bottom_vec.clear();
+ sep_blob_bottom_vec.push_back(blob_sep.get());
+ convolution_param->set_kernel_h(1);
+ convolution_param->set_kernel_w(3);
+ convolution_param->set_stride_h(1);
+ convolution_param->set_stride_w(2);
+ convolution_param->set_num_output(1);
+ convolution_param->set_bias_term(false);
+ layer.reset(new ConvolutionLayer<Dtype>(layer_param));
+ layer->blobs().resize(1);
+ layer->blobs()[0].reset(new Blob<Dtype>(1, 1, 1, 3));
+ Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data();
+ weights_2[0] = -1;
+ weights_2[1] = 0;
+ weights_2[2] = 1;
+ layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
+ layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
+ // Test equivalence of full and separable filters.
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ const Dtype* sep_top_data = this->blob_top_2_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], sep_top_data[i], 1e-4);
+ }
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) {
+ typedef typename TypeParam::Dtype Dtype;
+ const int kernel_h = 11;
+ const int kernel_w = 13;
+ vector<int> bottom_shape(4);
+ bottom_shape[0] = 15;
+ bottom_shape[1] = 18;
+ bottom_shape[2] = kernel_h * 2;
+ bottom_shape[3] = kernel_w * 2;
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+ this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+ filler.Fill(this->blob_bottom_vec_[i]);
+ }
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->set_num_output(12);
+ convolution_param->set_bias_term(false);
+ convolution_param->set_group(6);
+ convolution_param->set_kernel_h(kernel_h);
+ convolution_param->set_kernel_w(kernel_w);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ Blob<Dtype> weights;
+ Blob<Dtype> top_diff;
+ // Shape and fill weights and top_diff.
+ bool copy_diff;
+ bool reshape;
+ {
+ ConvolutionLayer<Dtype> layer(layer_param);
+ layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ top_diff.ReshapeLike(*this->blob_top_);
+ filler.Fill(&top_diff);
+ ASSERT_EQ(1, layer.blobs().size());
+ copy_diff = false; reshape = true;
+ weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape);
+ }
+ vector<bool> propagate_down(1, true);
+ Blob<Dtype> result_2d;
+ Blob<Dtype> backward_result_2d;
+ Blob<Dtype> backward_weight_result_2d;
+ // Test with 2D im2col
+ {
+ caffe_set(this->blob_top_->count(), Dtype(0),
+ this->blob_top_->mutable_cpu_data());
+ caffe_set(this->blob_bottom_->count(), Dtype(0),
+ this->blob_bottom_->mutable_cpu_diff());
+ caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+ // Do SetUp and Forward; save Forward result in result_2d.
+ convolution_param->set_force_nd_im2col(false);
+
+ layer_param.set_type("Convolution");
+
+ Layer<Dtype> & layer_2d=*LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+
+ layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ ASSERT_EQ(1, layer_2d.blobs().size());
+ copy_diff = false; reshape = false;
+ layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+ layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ copy_diff = false; reshape = true;
+ result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape);
+ // Copy pre-generated top diff into actual top diff;
+ // do Backward and save result in backward_result_2d.
+ ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+ caffe_copy(top_diff.count(), top_diff.cpu_data(),
+ this->blob_top_->mutable_cpu_diff());
+ layer_2d.Backward(this->blob_top_vec_, propagate_down,
+ this->blob_bottom_vec_);
+ copy_diff = true; reshape = true;
+ backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+ backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape);
+ }
+ Blob<Dtype> result_nd;
+ Blob<Dtype> backward_result_nd;
+ Blob<Dtype> backward_weight_result_nd;
+ // Test with ND im2col
+ {
+ caffe_set(this->blob_top_->count(), Dtype(0),
+ this->blob_top_->mutable_cpu_data());
+ caffe_set(this->blob_bottom_->count(), Dtype(0),
+ this->blob_bottom_->mutable_cpu_diff());
+ caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+ // Do SetUp and Forward; save Forward result in result_nd.
+ convolution_param->set_force_nd_im2col(true);
+
+ layer_param.set_type("Convolution");
+
+ Layer<Dtype>& layer_nd=*LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ ASSERT_EQ(1, layer_nd.blobs().size());
+ copy_diff = false; reshape = false;
+ layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+ layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ copy_diff = false; reshape = true;
+ result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape);
+ // Copy pre-generated top diff into actual top diff;
+ // do Backward and save result in backward_result_nd.
+ ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+ caffe_copy(top_diff.count(), top_diff.cpu_data(),
+ this->blob_top_->mutable_cpu_diff());
+ layer_nd.Backward(this->blob_top_vec_, propagate_down,
+ this->blob_bottom_vec_);
+ copy_diff = true; reshape = true;
+ backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+ backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape);
+ }
+ ASSERT_EQ(result_nd.count(), result_2d.count());
+ for (int i = 0; i < result_2d.count(); ++i) {
+ EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]);
+ }
+ ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
+ for (int i = 0; i < backward_result_2d.count(); ++i) {
+ EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+ backward_result_nd.cpu_diff()[i]);
+ }
+ ASSERT_EQ(backward_weight_result_nd.count(),
+ backward_weight_result_2d.count());
+ for (int i = 0; i < backward_weight_result_2d.count(); ++i) {
+ EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i],
+ backward_weight_result_nd.cpu_diff()[i]);
+ }
+}
+#endif
+
+
+} // namespace caffe
diff --git a/unit_tests/test_fail.cpp b/unit_tests/test_fail.cpp
new file mode 100644
index 00000000..0bc54954
--- /dev/null
+++ b/unit_tests/test_fail.cpp
@@ -0,0 +1,419 @@
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/conv_layer.hpp"
+
+#ifdef USE_CUDNN
+#include "caffe/layers/cudnn_conv_layer.hpp"
+#endif
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+#include <iomanip>
+
+namespace caffe {
+
+template <typename Dtype>
+void dump_blob(const Blob<Dtype> * blob, const char * outfile)
+{
+ std::ofstream os;
+ os.open(outfile);
+
+  os<<std::setiosflags(std::ios::fixed);
+
+ for(int i=0;i<blob->LegacyShape(0);i++)
+ {
+
+ for(int j=0;j<blob->LegacyShape(1);j++)
+ {
+
+ for(int k=0;k<blob->LegacyShape(2);k++)
+ {
+ for(int l=0;l<blob->LegacyShape(3);l++)
+ {
+ Dtype data=blob->data_at(i,j,k,l);
+ os<<std::setprecision(12)<<data<<", ";
+ }
+ os<<std::endl;
+ }
+ os<<std::endl;
+ }
+ os<<std::endl;
+ }
+
+ os.close();
+
+}
+
+
+template <typename Dtype>
+void fill_blob_data(Blob<Dtype >* bottom, int fixed, float val)
+{
+ for(int i=0;i<bottom->num();i++)
+ for(int j=0;j<bottom->channels();j++)
+ for(int l=0;l<bottom->height();l++)
+ for(int k=0;k<bottom->width();k++)
+ {
+ int offset;
+ Dtype * ptr;
+
+ offset=i*bottom->channels()*bottom->height()*bottom->width()+
+ j*bottom->height()*bottom->width()+
+ l*bottom->width()+k;
+
+ ptr=bottom->mutable_cpu_data();
+
+ if(fixed)
+ ptr[offset]=val;
+ else
+ ptr[offset]=offset+100;
+
+ }
+
+
+}
+
+
+template <typename Dtype>
+void load_blob_data(Blob<Dtype >* bottom, Dtype * p_data)
+{
+ for(int i=0;i<bottom->num();i++)
+ for(int j=0;j<bottom->channels();j++)
+ for(int l=0;l<bottom->height();l++)
+ for(int k=0;k<bottom->width();k++)
+ {
+ int offset;
+ Dtype * ptr;
+
+ offset=i*bottom->channels()*bottom->height()*bottom->width()+
+ j*bottom->height()*bottom->width()+
+ l*bottom->width()+k;
+
+ ptr=bottom->mutable_cpu_data();
+
+ ptr[offset]=p_data[offset];
+
+ }
+
+}
+
+
+
+// Reference convolution for checking results:
+// accumulate through explicit loops over input, output, and filters.
+template <typename Dtype>
+void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
+ const vector<shared_ptr<Blob<Dtype> > >& weights,
+ Blob<Dtype>* out) {
+ const bool has_depth = (out->num_axes() == 5);
+ if (!has_depth) { CHECK_EQ(4, out->num_axes()); }
+ // Kernel size, stride, and pad
+ int kernel_h, kernel_w;
+ if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) {
+ kernel_h = conv_param->kernel_h();
+ kernel_w = conv_param->kernel_w();
+ } else {
+ kernel_h = kernel_w = conv_param->kernel_size(0);
+ }
+ int pad_h, pad_w;
+ if (conv_param->has_pad_h() || conv_param->has_pad_w()) {
+ pad_h = conv_param->pad_h();
+ pad_w = conv_param->pad_w();
+ } else {
+ pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0;
+ }
+ int stride_h, stride_w;
+ if (conv_param->has_stride_h() || conv_param->has_stride_w()) {
+ stride_h = conv_param->stride_h();
+ stride_w = conv_param->stride_w();
+ } else {
+ stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1;
+ }
+ int dilation_h, dilation_w;
+ dilation_h = dilation_w = conv_param->dilation_size() ?
+ conv_param->dilation(0) : 1;
+ int kernel_d, pad_d, stride_d, dilation_d;
+ if (has_depth) {
+ kernel_d = kernel_h;
+ stride_d = stride_h;
+ pad_d = pad_h;
+ dilation_d = dilation_h;
+ } else {
+ kernel_d = stride_d = dilation_d = 1;
+ pad_d = 0;
+ }
+ // Groups
+ int groups = conv_param->group();
+ int o_g = out->shape(1) / groups;
+ int k_g = in->shape(1) / groups;
+ int o_head, k_head;
+ // Convolution
+ vector<int> weight_offset(4 + has_depth);
+ vector<int> in_offset(4 + has_depth);
+ vector<int> out_offset(4 + has_depth);
+ Dtype* out_data = out->mutable_cpu_data();
+ for (int n = 0; n < out->shape(0); n++) {
+ for (int g = 0; g < groups; g++) {
+ o_head = o_g * g;
+ k_head = k_g * g;
+ for (int o = 0; o < o_g; o++) {
+ for (int k = 0; k < k_g; k++) {
+ for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+ for (int y = 0; y < out->shape(2 + has_depth); y++) {
+ for (int x = 0; x < out->shape(3 + has_depth); x++) {
+ for (int r = 0; r < kernel_d; r++) {
+ for (int p = 0; p < kernel_h; p++) {
+ for (int q = 0; q < kernel_w; q++) {
+ int in_z = z * stride_d - pad_d + r * dilation_d;
+ int in_y = y * stride_h - pad_h + p * dilation_h;
+ int in_x = x * stride_w - pad_w + q * dilation_w;
+ if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1)
+ && in_y >= 0 && in_y < in->shape(2 + has_depth)
+ && in_x >= 0 && in_x < in->shape(3 + has_depth)) {
+ weight_offset[0] = o + o_head;
+ weight_offset[1] = k;
+ if (has_depth) { weight_offset[2] = r; }
+ weight_offset[2 + has_depth] = p;
+ weight_offset[3 + has_depth] = q;
+ in_offset[0] = n;
+ in_offset[1] = k + k_head;
+ if (has_depth) { in_offset[2] = in_z; }
+ in_offset[2 + has_depth] = in_y;
+ in_offset[3 + has_depth] = in_x;
+ out_offset[0] = n;
+ out_offset[1] = o + o_head;
+ if (has_depth) { out_offset[2] = z; }
+ out_offset[2 + has_depth] = y;
+ out_offset[3 + has_depth] = x;
+ out_data[out->offset(out_offset)] +=
+ in->data_at(in_offset)
+ * weights[0]->data_at(weight_offset);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ // Bias
+ if (conv_param->bias_term()) {
+ const Dtype* bias_data = weights[1]->cpu_data();
+ for (int n = 0; n < out->shape(0); n++) {
+ for (int o = 0; o < out->shape(1); o++) {
+ for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+ for (int y = 0; y < out->shape(2 + has_depth); y++) {
+ for (int x = 0; x < out->shape(3 + has_depth); x++) {
+ out_offset[0] = n;
+ out_offset[1] = o;
+ if (has_depth) { out_offset[2] = z; }
+ out_offset[2 + has_depth] = y;
+ out_offset[3 + has_depth] = x;
+ out_data[out->offset(out_offset)] += bias_data[o];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template void caffe_conv(const Blob<float>* in,
+ ConvolutionParameter* conv_param,
+ const vector<shared_ptr<Blob<float> > >& weights,
+ Blob<float>* out);
+template void caffe_conv(const Blob<double>* in,
+ ConvolutionParameter* conv_param,
+ const vector<shared_ptr<Blob<double> > >& weights,
+ Blob<double>* out);
+
+template <typename TypeParam>
+class ConvolutionLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+ ConvolutionLayerTest()
+ : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)),
+ blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)),
+ blob_top_(new Blob<Dtype>()),
+ blob_top_2_(new Blob<Dtype>()) {}
+ virtual void SetUp() {
+ // fill the values
+ FillerParameter filler_param;
+ filler_param.set_value(1.);
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ filler.Fill(this->blob_bottom_2_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+
+ virtual ~ConvolutionLayerTest() {
+ delete blob_bottom_;
+ delete blob_bottom_2_;
+ delete blob_top_;
+ delete blob_top_2_;
+ }
+
+ virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) {
+ this->ref_blob_top_.reset(new Blob<Dtype>());
+ this->ref_blob_top_->ReshapeLike(*top);
+ return this->ref_blob_top_.get();
+ }
+
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_bottom_2_;
+ Blob<Dtype>* const blob_top_;
+ Blob<Dtype>* const blob_top_2_;
+ shared_ptr<Blob<Dtype> > ref_blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+
+#define TestDtypesAndDevices float_only
+TYPED_TEST_CASE(ConvolutionLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(ConvolutionLayerTest, TestSetup) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(2);
+ convolution_param->set_num_output(4);
+ this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+ this->blob_top_vec_.push_back(this->blob_top_2_);
+
+ layer_param.set_type("Convolution");
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->channels(), 4);
+ EXPECT_EQ(this->blob_top_->height(), 2);
+ EXPECT_EQ(this->blob_top_->width(), 1);
+ EXPECT_EQ(this->blob_top_2_->num(), 2);
+ EXPECT_EQ(this->blob_top_2_->channels(), 4);
+ EXPECT_EQ(this->blob_top_2_->height(), 2);
+ EXPECT_EQ(this->blob_top_2_->width(), 1);
+ // setting group should not change the shape
+ convolution_param->set_num_output(3);
+ convolution_param->set_group(3);
+ layer.reset(new ConvolutionLayer<Dtype>(layer_param));
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->channels(), 3);
+ EXPECT_EQ(this->blob_top_->height(), 2);
+ EXPECT_EQ(this->blob_top_->width(), 1);
+ EXPECT_EQ(this->blob_top_2_->num(), 2);
+ EXPECT_EQ(this->blob_top_2_->channels(), 3);
+ EXPECT_EQ(this->blob_top_2_->height(), 2);
+ EXPECT_EQ(this->blob_top_2_->width(), 1);
+}
+
+float fail3_weight[]={
+-0.850632905960, -1.578843951225, -0.890021681786,
+0.971448659897, -0.538104891777, 0.233876436949,
+-1.242745161057, 2.211859703064, 0.525026142597,
+
+-1.726792931557, -1.194667577744, 1.119420289993,
+-1.539444208145, 1.725312829018, -1.573384165764,
+0.519557833672, 0.376551657915, -0.615215837955,
+
+0.758795797825, -0.498177528381, 0.254181325436,
+-0.071698464453, -1.192728281021, 0.776199519634,
+1.837580919266, -0.478745609522, -0.804457962513,
+
+
+-2.220808744431, -0.892578184605, -1.422935843468,
+-1.707052111626, -1.837757468224, -1.312300324440,
+-1.251585721970, -1.591378808022, -0.577652215958,
+
+1.727164268494, 0.176050186157, -1.804216146469,
+0.547152698040, -0.024264926091, -2.040683984756,
+-2.159983396530, 1.692966818810, -1.558626413345,
+
+-1.242013096809, 0.122898645699, -0.146973758936,
+-0.405744194984, -1.716119289398, 1.215066313744,
+1.061164021492, -0.705341339111, -0.245370775461,
+
+
+0.781007647514, -0.104610890150, 2.421228170395,
+0.348720043898, 0.289468020201, 1.841132760048,
+-0.835199236870, -0.242239400744, 1.169079542160,
+
+0.165550187230, -0.418082803488, 0.479667782784,
+-0.241552516818, 0.767971694469, -0.760977804661,
+-2.419095993042, 0.774254024029, 0.541432976723,
+
+0.855292022228, -0.144438281655, 0.251998007298,
+-0.242634430528, -0.044748753309, -0.321820944548,
+-0.487676948309, -0.761075556278, -0.646164357662
+};
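+// 81 values: 3 output channels x 3 input channels x (3 x 3) kernel entries,
+// matching the (3, 3, 3, 3) weight blob of the convolution set up below.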
+
+TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
+ typedef typename TypeParam::Dtype Dtype;
+
+ LayerParameter layer_param;
+ ConvolutionParameter* convolution_param =
+ layer_param.mutable_convolution_param();
+ convolution_param->add_kernel_size(3);
+ convolution_param->add_stride(1);
+ convolution_param->set_num_output(3);
+ convolution_param->mutable_weight_filler()->set_type("gaussian");
+ convolution_param->mutable_bias_filler()->set_type("constant");
+ convolution_param->mutable_bias_filler()->set_value(0.1);
+
+ vector<int> bottom_shape;
+ bottom_shape.push_back(1);
+ bottom_shape.push_back(3);
+ bottom_shape.push_back(5);
+ bottom_shape.push_back(5);
+
+ this->blob_bottom_->Reshape(bottom_shape);
+
+ fill_blob_data(this->blob_bottom_,0,1);
+
+ layer_param.set_type("Convolution");
+
+ shared_ptr<Layer<Dtype> > layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ //fill_blob_data(layer->blobs()[0].get(),1,1);
+ load_blob_data(layer->blobs()[0].get(),fail3_weight);
+
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Check against reference convolution.
+ const Dtype* top_data;
+ const Dtype* ref_top_data;
+ caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+ this->MakeReferenceTop(this->blob_top_));
+ top_data = this->blob_top_->cpu_data();
+ ref_top_data = this->ref_blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_top_->count(); ++i) {
+ EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+ }
+
+
+ dump_blob(this->blob_bottom_,"bottom.data");
+ dump_blob(this->blob_top_,"top.data");
+ dump_blob(this->ref_blob_top_.get(),"reftop.data");
+ dump_blob(layer->blobs()[0].get(),"weight.data");
+ dump_blob(layer->blobs()[1].get(),"bias.data");
+}
+
+} // namespace caffe
diff --git a/unit_tests/test_inner_product_layer.cpp b/unit_tests/test_inner_product_layer.cpp
new file mode 100644
index 00000000..efe3d87f
--- /dev/null
+++ b/unit_tests/test_inner_product_layer.cpp
@@ -0,0 +1,295 @@
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/inner_product_layer.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+#ifndef CPU_ONLY
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+#endif
+
+template <typename TypeParam>
+class InnerProductLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+ protected:
+ InnerProductLayerTest()
+ : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+ blob_bottom_nobatch_(new Blob<Dtype>(1, 2, 3, 4)),
+ blob_top_(new Blob<Dtype>()) {
+ // fill the values
+ FillerParameter filler_param;
+ UniformFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+ virtual ~InnerProductLayerTest() {
+ delete blob_bottom_;
+ delete blob_bottom_nobatch_;
+ delete blob_top_;
+ }
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_bottom_nobatch_;
+ Blob<Dtype>* const blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+
+#define TestDtypesAndDevices float_only
+
+TYPED_TEST_CASE(InnerProductLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(InnerProductLayerTest, TestSetUp) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_);
+ LayerParameter layer_param;
+ InnerProductParameter* inner_product_param =
+ layer_param.mutable_inner_product_param();
+ inner_product_param->set_num_output(10);
+
+ layer_param.set_type("InnerProduct");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<InnerProductLayer<Dtype> > layer=
+ boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer);
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->height(), 1);
+ EXPECT_EQ(this->blob_top_->width(), 1);
+ EXPECT_EQ(this->blob_top_->channels(), 10);
+}
+
+/** @brief TestSetUp while toggling transpose flag
+ */
+TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeFalse) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_);
+ LayerParameter layer_param;
+ InnerProductParameter* inner_product_param =
+ layer_param.mutable_inner_product_param();
+ inner_product_param->set_num_output(10);
+ inner_product_param->set_transpose(false);
+
+ layer_param.set_type("InnerProduct");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<InnerProductLayer<Dtype> > layer=
+ boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer);
+
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(2, this->blob_top_->num());
+ EXPECT_EQ(1, this->blob_top_->height());
+ EXPECT_EQ(1, this->blob_top_->width());
+ EXPECT_EQ(10, this->blob_top_->channels());
+ EXPECT_EQ(2, layer->blobs()[0]->num_axes());
+ EXPECT_EQ(10, layer->blobs()[0]->shape(0));
+ EXPECT_EQ(60, layer->blobs()[0]->shape(1));
+}
+
+/** @brief TestSetUp while toggling transpose flag
+ */
+TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeTrue) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_);
+ LayerParameter layer_param;
+ InnerProductParameter* inner_product_param =
+ layer_param.mutable_inner_product_param();
+ inner_product_param->set_num_output(10);
+ inner_product_param->set_transpose(true);
+
+ layer_param.set_type("InnerProduct");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<InnerProductLayer<Dtype> > layer=
+ boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(2, this->blob_top_->num());
+ EXPECT_EQ(1, this->blob_top_->height());
+ EXPECT_EQ(1, this->blob_top_->width());
+ EXPECT_EQ(10, this->blob_top_->channels());
+ EXPECT_EQ(2, layer->blobs()[0]->num_axes());
+ EXPECT_EQ(60, layer->blobs()[0]->shape(0));
+ EXPECT_EQ(10, layer->blobs()[0]->shape(1));
+}
+
+TYPED_TEST(InnerProductLayerTest, TestForward) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_);
+ bool IS_VALID_CUDA = false;
+#ifndef CPU_ONLY
+ IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
+#endif
+ if (Caffe::mode() == Caffe::CPU ||
+ sizeof(Dtype) == 4 || IS_VALID_CUDA) {
+ LayerParameter layer_param;
+ InnerProductParameter* inner_product_param =
+ layer_param.mutable_inner_product_param();
+ inner_product_param->set_num_output(10);
+ inner_product_param->mutable_weight_filler()->set_type("uniform");
+ inner_product_param->mutable_bias_filler()->set_type("uniform");
+ inner_product_param->mutable_bias_filler()->set_min(1);
+ inner_product_param->mutable_bias_filler()->set_max(2);
+
+ layer_param.set_type("InnerProduct");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<InnerProductLayer<Dtype> > layer=
+ boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ const Dtype* data = this->blob_top_->cpu_data();
+ const int count = this->blob_top_->count();
+ for (int i = 0; i < count; ++i) {
+ EXPECT_GE(data[i], 1.);
+ }
+ } else {
+ LOG(ERROR) << "Skipping test due to old architecture.";
+ }
+}
+
+/**
+ * @brief Initialize an IP layer without transpose and with random weights,
+ * run Forward, and save the result. Then initialize a second IP layer with
+ * transpose, manually copy and transpose the weights from the first layer,
+ * run Forward on the same input, and check that the results match.
+ */
+TYPED_TEST(InnerProductLayerTest, TestForwardTranspose) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_);
+ bool IS_VALID_CUDA = false;
+#ifndef CPU_ONLY
+ IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
+#endif
+ if (Caffe::mode() == Caffe::CPU ||
+ sizeof(Dtype) == 4 || IS_VALID_CUDA) {
+ LayerParameter layer_param;
+ InnerProductParameter* inner_product_param =
+ layer_param.mutable_inner_product_param();
+ inner_product_param->set_num_output(10);
+ inner_product_param->mutable_weight_filler()->set_type("uniform");
+ inner_product_param->mutable_bias_filler()->set_type("uniform");
+ inner_product_param->mutable_bias_filler()->set_min(1);
+ inner_product_param->mutable_bias_filler()->set_max(2);
+ inner_product_param->set_transpose(false);
+
+ layer_param.set_type("InnerProduct");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<InnerProductLayer<Dtype> > layer=
+ boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ const int count = this->blob_top_->count();
+ Blob<Dtype>* const top = new Blob<Dtype>();
+ top->ReshapeLike(*this->blob_top_);
+ caffe_copy(count, this->blob_top_->cpu_data(), top->mutable_cpu_data());
+ this->blob_top_vec_.clear();
+ this->blob_top_vec_.push_back(new Blob<Dtype>());
+ inner_product_param->set_transpose(true);
+ shared_ptr<InnerProductLayer<Dtype> > ip_t(
+ new InnerProductLayer<Dtype>(layer_param));
+ ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ const int count_w = layer->blobs()[0]->count();
+ EXPECT_EQ(count_w, ip_t->blobs()[0]->count());
+ // manually copy and transpose the weights from 1st IP layer into 2nd
+ const Dtype* w = layer->blobs()[0]->cpu_data();
+ Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data();
+ const int width = layer->blobs()[0]->shape(1);
+ const int width_t = ip_t->blobs()[0]->shape(1);
+ for (int i = 0; i < count_w; ++i) {
+ int r = i / width;
+ int c = i % width;
+ w_t[c*width_t+r] = w[r*width+c]; // copy while transposing
+ }
+ // copy bias from 1st IP layer to 2nd IP layer
+ ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count());
+ caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(),
+ ip_t->blobs()[1]->mutable_cpu_data());
+ ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(count, this->blob_top_->count())
+ << "Invalid count for top blob for IP with transpose.";
+    Blob<Dtype>* const top_t = new Blob<Dtype>();
+ top_t->ReshapeLike(*this->blob_top_vec_[0]);
+ caffe_copy(count,
+ this->blob_top_vec_[0]->cpu_data(),
+ top_t->mutable_cpu_data());
+ const Dtype* data = top->cpu_data();
+ const Dtype* data_t = top_t->cpu_data();
+ for (int i = 0; i < count; ++i) {
+ EXPECT_FLOAT_EQ(data[i], data_t[i]);
+ }
+ } else {
+ LOG(ERROR) << "Skipping test due to old architecture.";
+ }
+}
+
+TYPED_TEST(InnerProductLayerTest, TestForwardNoBatch) {
+ typedef typename TypeParam::Dtype Dtype;
+ this->blob_bottom_vec_.push_back(this->blob_bottom_nobatch_);
+ bool IS_VALID_CUDA = false;
+#ifndef CPU_ONLY
+ IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
+#endif
+ if (Caffe::mode() == Caffe::CPU ||
+ sizeof(Dtype) == 4 || IS_VALID_CUDA) {
+ LayerParameter layer_param;
+ InnerProductParameter* inner_product_param =
+ layer_param.mutable_inner_product_param();
+ inner_product_param->set_num_output(10);
+ inner_product_param->mutable_weight_filler()->set_type("uniform");
+ inner_product_param->mutable_bias_filler()->set_type("uniform");
+ inner_product_param->mutable_bias_filler()->set_min(1);
+ inner_product_param->mutable_bias_filler()->set_max(2);
+
+ layer_param.set_type("InnerProduct");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<InnerProductLayer<Dtype> > layer=
+ boost::static_pointer_cast<InnerProductLayer<Dtype> > (new_layer);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ const Dtype* data = this->blob_top_->cpu_data();
+ const int count = this->blob_top_->count();
+ for (int i = 0; i < count; ++i) {
+ EXPECT_GE(data[i], 1.);
+ }
+ } else {
+ LOG(ERROR) << "Skipping test due to old architecture.";
+ }
+}
+
+
+} // namespace caffe
diff --git a/unit_tests/test_lrn_layer.cpp b/unit_tests/test_lrn_layer.cpp
new file mode 100644
index 00000000..22ae8c64
--- /dev/null
+++ b/unit_tests/test_lrn_layer.cpp
@@ -0,0 +1,344 @@
+#include <algorithm>
+#include <vector>
+#include <iostream>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/lrn_layer.hpp"
+
+#ifdef USE_CUDNN
+#include "caffe/layers/cudnn_lcn_layer.hpp"
+#include "caffe/layers/cudnn_lrn_layer.hpp"
+#endif
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+using std::min;
+using std::max;
+
+static int test_h = 5;
+static int test_w = 5;
+
+namespace caffe {
+
+
+template <typename Dtype>
+static void dump_blob(const Blob<Dtype> * blob, const char * outfile)
+{
+ std::ofstream os;
+ os.open(outfile);
+
+ for(int i=0;i<blob->shape(0);i++)
+ for(int j=0;j<blob->shape(1);j++)
+ for(int k=0;k<blob->shape(2);k++)
+ for(int l=0;l<blob->shape(3);l++)
+ {
+ Dtype data=blob->data_at(i,j,k,l);
+
+ os<<data<<std::endl;
+ }
+
+ os.close();
+
+}
+
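+/* Fill a blob with either a fixed value (fixed != 0) or each element's
+ * linear offset, which makes individual mismatches easy to locate. */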
+template <typename Dtype>
+static void fill_blob_data(Blob<Dtype >* bottom, int fixed, float val)
+{
+ for(int i=0;i<bottom->num();i++)
+ for(int j=0;j<bottom->channels();j++)
+ for(int l=0;l<bottom->height();l++)
+ for(int k=0;k<bottom->width();k++)
+ {
+ int offset;
+ Dtype * ptr;
+
+ offset=i*bottom->channels()*bottom->height()*bottom->width()+
+ j*bottom->height()*bottom->width()+
+ l*bottom->width()+k;
+
+ ptr=bottom->mutable_cpu_data();
+
+ if(fixed)
+ ptr[offset]=val;
+ else
+ ptr[offset]=offset;
+
+ }
+
+
+}
+
+
+template <typename TypeParam>
+class LRNLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+ LRNLayerTest()
+ : epsilon_(Dtype(1e-5)),
+ blob_bottom_(new Blob<Dtype>()),
+ blob_top_(new Blob<Dtype>()) {}
+ virtual void SetUp() {
+ Caffe::set_random_seed(1701);
+ blob_bottom_->Reshape(2, 7, test_h,test_w);
+ // fill the values
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+ virtual ~LRNLayerTest() { delete blob_bottom_; delete blob_top_; }
+ void ReferenceLRNForward(const Blob<Dtype>& blob_bottom,
+ const LayerParameter& layer_param, Blob<Dtype>* blob_top);
+
+ Dtype epsilon_;
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+template <typename TypeParam>
+void LRNLayerTest<TypeParam>::ReferenceLRNForward(
+ const Blob<Dtype>& blob_bottom, const LayerParameter& layer_param,
+ Blob<Dtype>* blob_top) {
+ typedef typename TypeParam::Dtype Dtype;
+ blob_top->Reshape(blob_bottom.num(), blob_bottom.channels(),
+ blob_bottom.height(), blob_bottom.width());
+ Dtype* top_data = blob_top->mutable_cpu_data();
+ LRNParameter lrn_param = layer_param.lrn_param();
+ Dtype alpha = lrn_param.alpha();
+ Dtype beta = lrn_param.beta();
+ int size = lrn_param.local_size();
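+  // Reference LRN, matching the loops below:
+  //   top = bottom / (1 + alpha * sum(x_i^2) / N)^beta
+  // where the sum runs over the local window and N is size for
+  // ACROSS_CHANNELS and size*size for WITHIN_CHANNEL.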
+ switch (lrn_param.norm_region()) {
+ case LRNParameter_NormRegion_ACROSS_CHANNELS:
+ for (int n = 0; n < blob_bottom.num(); ++n) {
+ for (int c = 0; c < blob_bottom.channels(); ++c) {
+ for (int h = 0; h < blob_bottom.height(); ++h) {
+ for (int w = 0; w < blob_bottom.width(); ++w) {
+ int c_start = c - (size - 1) / 2;
+ int c_end = min(c_start + size, blob_bottom.channels());
+ c_start = max(c_start, 0);
+ Dtype scale = 1.;
+ for (int i = c_start; i < c_end; ++i) {
+ Dtype value = blob_bottom.data_at(n, i, h, w);
+ scale += value * value * alpha / size;
+ }
+ *(top_data + blob_top->offset(n, c, h, w)) =
+ blob_bottom.data_at(n, c, h, w) / pow(scale, beta);
+ }
+ }
+ }
+ }
+ break;
+ case LRNParameter_NormRegion_WITHIN_CHANNEL:
+ for (int n = 0; n < blob_bottom.num(); ++n) {
+ for (int c = 0; c < blob_bottom.channels(); ++c) {
+ for (int h = 0; h < blob_bottom.height(); ++h) {
+ int h_start = h - (size - 1) / 2;
+ int h_end = min(h_start + size, blob_bottom.height());
+ h_start = max(h_start, 0);
+ for (int w = 0; w < blob_bottom.width(); ++w) {
+ Dtype scale = 1.;
+ int w_start = w - (size - 1) / 2;
+ int w_end = min(w_start + size, blob_bottom.width());
+ w_start = max(w_start, 0);
+
+// std::cout<<"h,w ("<<h<<","<<w<<"): ";
+// std::cout<<"box: ( h "<<h_start<<","<<h_end<<")";
+// std::cout<<" (w "<<w_start<<","<<w_end<<")"<<std::endl;
+
+ for (int nh = h_start; nh < h_end; ++nh) {
+ for (int nw = w_start; nw < w_end; ++nw) {
+ Dtype value = blob_bottom.data_at(n, c, nh, nw);
+ scale += value * value * alpha / (size * size);
+ }
+ }
+ *(top_data + blob_top->offset(n, c, h, w)) =
+ blob_bottom.data_at(n, c, h, w) / pow(scale, beta);
+ }
+ }
+ }
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unknown normalization region.";
+ }
+}
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+
+#define TestDtypesAndDevices float_only
+
+TYPED_TEST_CASE(LRNLayerTest, TestDtypesAndDevices);
+
+#if 1
+TYPED_TEST(LRNLayerTest, TestSetupAcrossChannels) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ LRNLayer<Dtype> layer(layer_param);
+ layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->channels(), 7);
+ EXPECT_EQ(this->blob_top_->height(), test_h);
+ EXPECT_EQ(this->blob_top_->width(), test_w);
+}
+
+TYPED_TEST(LRNLayerTest, TestForwardAcrossChannels) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+// LRNLayer<Dtype> layer(layer_param);
+
+ layer_param.mutable_lrn_param()->set_local_size(3);
+
+ layer_param.set_type("LRN");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<LRNLayer<Dtype> > layer=
+ boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer);
+
+ vector<int> bottom_shape;
+ bottom_shape.push_back(1);
+ bottom_shape.push_back(5);
+ bottom_shape.push_back(5);
+ bottom_shape.push_back(5);
+
+
+ this->blob_bottom_vec_[0]->Reshape(bottom_shape);
+
+ fill_blob_data(this->blob_bottom_,1,1);
+
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+
+ Blob<Dtype> top_reference;
+ this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+ &top_reference);
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
+ this->epsilon_);
+ }
+}
+
+
+
+TYPED_TEST(LRNLayerTest, TestForwardAcrossChannelsLargeRegion) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ layer_param.mutable_lrn_param()->set_local_size(15);
+
+
+ layer_param.set_type("LRN");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<LRNLayer<Dtype> > layer=
+ boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ Blob<Dtype> top_reference;
+ this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+ &top_reference);
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
+ this->epsilon_);
+ }
+}
+
+
+TYPED_TEST(LRNLayerTest, TestSetupWithinChannel) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ layer_param.mutable_lrn_param()->set_norm_region(
+ LRNParameter_NormRegion_WITHIN_CHANNEL);
+ layer_param.mutable_lrn_param()->set_local_size(3);
+
+
+ layer_param.set_type("LRN");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<LRNLayer<Dtype> > layer=
+ boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ EXPECT_EQ(this->blob_top_->num(), 2);
+ EXPECT_EQ(this->blob_top_->channels(), 7);
+ EXPECT_EQ(this->blob_top_->height(), test_h);
+ EXPECT_EQ(this->blob_top_->width(), test_w);
+}
+#endif
+
+#if 1
+
+TYPED_TEST(LRNLayerTest, TestForwardWithinChannel) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ layer_param.mutable_lrn_param()->set_norm_region(
+ LRNParameter_NormRegion_WITHIN_CHANNEL);
+ layer_param.mutable_lrn_param()->set_local_size(3);
+// layer_param.mutable_lrn_param()->set_beta(1);
+
+
+ layer_param.set_type("LRN");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<LRNLayer<Dtype> > layer=
+ boost::static_pointer_cast<LRNLayer<Dtype> > (new_layer);
+
+/* presetting bottom_vec and data */
+
+ vector<int> bottom_shape;
+ bottom_shape.push_back(1);
+ bottom_shape.push_back(1);
+ bottom_shape.push_back(5);
+ bottom_shape.push_back(5);
+
+
+ this->blob_bottom_vec_[0]->Reshape(bottom_shape);
+
+ fill_blob_data(this->blob_bottom_,1,1);
+
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+
+ Blob<Dtype> top_reference;
+ this->ReferenceLRNForward(*(this->blob_bottom_), layer_param,
+ &top_reference);
+// for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+// EXPECT_NEAR(this->blob_top_->cpu_data()[i], top_reference.cpu_data()[i],
+// this->epsilon_);
+// }
+
+ dump_blob(this->blob_bottom_,"lrn.bottom.data");
+ dump_blob(this->blob_top_,"lrn.top.data");
+ dump_blob(&top_reference,"lrn.reftop.data");
+
+}
+
+#endif
+
+
+} // namespace caffe
diff --git a/unit_tests/test_neuron_layer.cpp b/unit_tests/test_neuron_layer.cpp
new file mode 100644
index 00000000..10c101da
--- /dev/null
+++ b/unit_tests/test_neuron_layer.cpp
@@ -0,0 +1,358 @@
+#include <algorithm>
+#include <vector>
+#include <cmath>
+
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+
+#include "caffe/layers/absval_layer.hpp"
+#include "caffe/layers/bnll_layer.hpp"
+#include "caffe/layers/dropout_layer.hpp"
+#include "caffe/layers/elu_layer.hpp"
+#include "caffe/layers/exp_layer.hpp"
+#include "caffe/layers/inner_product_layer.hpp"
+#include "caffe/layers/log_layer.hpp"
+#include "caffe/layers/power_layer.hpp"
+#include "caffe/layers/prelu_layer.hpp"
+#include "caffe/layers/relu_layer.hpp"
+#include "caffe/layers/sigmoid_layer.hpp"
+#include "caffe/layers/tanh_layer.hpp"
+#include "caffe/layers/threshold_layer.hpp"
+
+#ifdef USE_CUDNN
+#include "caffe/layers/cudnn_relu_layer.hpp"
+#include "caffe/layers/cudnn_sigmoid_layer.hpp"
+#include "caffe/layers/cudnn_tanh_layer.hpp"
+#endif
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+#define TestDtypesAndDevices float_only
+
+
+#define SET_LAYER(name) \
+ layer_param.set_type(#name);\
+ shared_ptr<Layer<Dtype> > new_layer=\
+ LayerRegistry<Dtype>::CreateLayer(layer_param);\
+ shared_ptr< name ## Layer <Dtype> > layer= \
+ boost::static_pointer_cast< name ## Layer <Dtype> > (new_layer);\
+ if(0) layer=shared_ptr<name ## Layer<Dtype> >(new name ## Layer<Dtype>(layer_param));\
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
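+// SET_LAYER(name) builds a "name" layer through the LayerRegistry rather than
+// constructing it directly, so whichever implementation is registered for that
+// type string (e.g. the ACL-backed one in this repo) is the one under test;
+// the result is downcast to name##Layer<Dtype> and SetUp() is called. The dead
+// if(0) assignment appears to be there only to keep the direct-construction
+// path compiling.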
+
+template <typename TypeParam>
+class NeuronLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+ NeuronLayerTest()
+ : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+ blob_top_(new Blob<Dtype>()) {
+ Caffe::set_random_seed(1701);
+ // fill the values
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+ virtual ~NeuronLayerTest() { delete blob_bottom_; delete blob_top_; }
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+
+
+ void TestPReLU(PReLULayer<Dtype> *layer) {
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Now, check values
+ const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ const Dtype* slope_data = layer->blobs()[0]->cpu_data();
+ int hw = this->blob_bottom_->height() * this->blob_bottom_->width();
+ int channels = this->blob_bottom_->channels();
+ bool channel_shared = layer->layer_param().prelu_param().channel_shared();
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ int c = channel_shared ? 0 : (i / hw) % channels;
+ EXPECT_EQ(top_data[i],
+ std::max(bottom_data[i], (Dtype)(0))
+ + slope_data[c] * std::min(bottom_data[i], (Dtype)(0)));
+ }
+ }
+
+};
+
+TYPED_TEST_CASE(NeuronLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(NeuronLayerTest, TestAbsVal) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+
+ SET_LAYER(AbsVal);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ const int count = this->blob_bottom_->count();
+ for (int i = 0; i < count; ++i) {
+ EXPECT_EQ(top_data[i], fabs(bottom_data[i]));
+ }
+}
+
+
+TYPED_TEST(NeuronLayerTest, TestReLU) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+
+
+ SET_LAYER(ReLU);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ // Now, check values
+ const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ EXPECT_GE(top_data[i], 0.);
+ EXPECT_TRUE(top_data[i] == 0 || top_data[i] == bottom_data[i]);
+ }
+}
+
+#if 1
+
+TYPED_TEST(NeuronLayerTest, TestReLUWithNegativeSlope) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ CHECK(google::protobuf::TextFormat::ParseFromString(
+ "relu_param { negative_slope: 0.01 }", &layer_param));
+
+ SET_LAYER(ReLU);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ // Now, check values
+ const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ if (top_data[i] >= 0) {
+ EXPECT_FLOAT_EQ(top_data[i], bottom_data[i]);
+ } else {
+ EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] * 0.01);
+ }
+ }
+}
+
+
+TYPED_TEST(NeuronLayerTest, TestSigmoid) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+
+ SET_LAYER(Sigmoid);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ // Now, check values
+ const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ EXPECT_FLOAT_EQ(top_data[i], 1. / (1 + exp(-bottom_data[i])));
+ // check that we squashed the value between 0 and 1
+ EXPECT_GE(top_data[i], 0.);
+ EXPECT_LE(top_data[i], 1.);
+ }
+}
+
+TYPED_TEST(NeuronLayerTest, TestTanH) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+
+ int number=10;
+
+ this->blob_bottom_->Reshape(1,2,number,2);
+
+ for(int i=0;i<number;i++)
+ this->blob_bottom_->mutable_cpu_data()[i]=i*10;
+
+ SET_LAYER(TanH);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ // Test exact values
+ for (int i = 0; i < this->blob_bottom_->num(); ++i) {
+ for (int j = 0; j < this->blob_bottom_->channels(); ++j) {
+ for (int k = 0; k < this->blob_bottom_->height(); ++k) {
+ for (int l = 0; l < this->blob_bottom_->width(); ++l) {
+
+ EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4,
+ (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) /
+ (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1));
+ EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4,
+ (exp(2*this->blob_bottom_->data_at(i, j, k, l)) - 1) /
+ (exp(2*this->blob_bottom_->data_at(i, j, k, l)) + 1));
+ }
+ }
+ }
+ }
+}
+
+
+TYPED_TEST(NeuronLayerTest, TestBNLL) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+
+ SET_LAYER(BNLL);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ // Now, check values
+ const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+ const Dtype* top_data = this->blob_top_->cpu_data();
+ for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+ Dtype target=log(1+exp(bottom_data[i]));
+ EXPECT_NEAR(top_data[i], target,1e-4);
+ }
+}
+#endif
+
+#if 0 /* PReLU is not exercised for now */
+
+TYPED_TEST(NeuronLayerTest, TestPReLUParam) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PReLULayer<Dtype> layer(layer_param);
+ layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ const Dtype* slopes = layer.blobs()[0]->cpu_data();
+ int count = layer.blobs()[0]->count();
+ for (int i = 0; i < count; ++i, ++slopes) {
+ EXPECT_EQ(*slopes, 0.25);
+ }
+}
+
+TYPED_TEST(NeuronLayerTest, TestPReLUForward) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PReLULayer<Dtype> layer(layer_param);
+ layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(layer.blobs()[0].get());
+ this->TestPReLU(&layer);
+}
+
+TYPED_TEST(NeuronLayerTest, TestPReLUForwardChannelShared) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ layer_param.mutable_prelu_param()->set_channel_shared(true);
+ PReLULayer<Dtype> layer(layer_param);
+ layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ this->TestPReLU(&layer);
+}
+
+
+TYPED_TEST(NeuronLayerTest, TestPReLUConsistencyReLU) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter prelu_layer_param;
+ LayerParameter relu_layer_param;
+ relu_layer_param.mutable_relu_param()->set_negative_slope(0.25);
+ PReLULayer<Dtype> prelu(prelu_layer_param);
+ ReLULayer<Dtype> relu(relu_layer_param);
+ // Set up blobs
+ vector<Blob<Dtype>*> blob_bottom_vec_2;
+ vector<Blob<Dtype>*> blob_top_vec_2;
+ shared_ptr<Blob<Dtype> > blob_bottom_2(new Blob<Dtype>());
+ shared_ptr<Blob<Dtype> > blob_top_2(new Blob<Dtype>());
+ blob_bottom_vec_2.push_back(blob_bottom_2.get());
+ blob_top_vec_2.push_back(blob_top_2.get());
+ blob_bottom_2->CopyFrom(*this->blob_bottom_, false, true);
+ // SetUp layers
+ prelu.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ relu.SetUp(blob_bottom_vec_2, blob_top_vec_2);
+ // Check forward
+ prelu.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ relu.Forward(this->blob_bottom_vec_, blob_top_vec_2);
+ for (int s = 0; s < blob_top_2->count(); ++s) {
+ EXPECT_EQ(this->blob_top_->cpu_data()[s], blob_top_2->cpu_data()[s]);
+ }
+ // Check backward
+}
+
+TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) {
+ typedef typename TypeParam::Dtype Dtype;
+ // Set layer parameters
+ LayerParameter ip_layer_param;
+ LayerParameter prelu_layer_param;
+ InnerProductParameter *ip_param =
+ ip_layer_param.mutable_inner_product_param();
+ ip_param->mutable_weight_filler()->set_type("gaussian");
+ ip_param->set_num_output(3);
+ InnerProductLayer<Dtype> ip(ip_layer_param);
+ PReLULayer<Dtype> prelu(prelu_layer_param);
+ InnerProductLayer<Dtype> ip2(ip_layer_param);
+ PReLULayer<Dtype> prelu2(prelu_layer_param);
+ // Set up blobs
+ vector<Blob<Dtype>*> blob_bottom_vec_2;
+ vector<Blob<Dtype>*> blob_middle_vec_2;
+ vector<Blob<Dtype>*> blob_top_vec_2;
+ shared_ptr<Blob<Dtype> > blob_bottom_2(new Blob<Dtype>());
+ shared_ptr<Blob<Dtype> > blob_middle_2(new Blob<Dtype>());
+ shared_ptr<Blob<Dtype> > blob_top_2(new Blob<Dtype>());
+ blob_bottom_vec_2.push_back(blob_bottom_2.get());
+ blob_middle_vec_2.push_back(blob_middle_2.get());
+ blob_top_vec_2.push_back(blob_top_2.get());
+ blob_bottom_2->CopyFrom(*this->blob_bottom_, false, true);
+ // SetUp layers
+ ip.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ prelu.SetUp(this->blob_top_vec_, this->blob_top_vec_);
+ ip2.SetUp(blob_bottom_vec_2, blob_middle_vec_2);
+ prelu2.SetUp(blob_middle_vec_2, blob_top_vec_2);
+ caffe_copy(ip2.blobs()[0]->count(), ip.blobs()[0]->cpu_data(),
+ ip2.blobs()[0]->mutable_cpu_data());
+ // Forward in-place
+ ip.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ prelu.Forward(this->blob_top_vec_, this->blob_top_vec_);
+ // Forward non-in-place
+ ip2.Forward(blob_bottom_vec_2, blob_middle_vec_2);
+ prelu2.Forward(blob_middle_vec_2, blob_top_vec_2);
+ // Check numbers
+ for (int s = 0; s < blob_top_2->count(); ++s) {
+ EXPECT_EQ(this->blob_top_->cpu_data()[s], blob_top_2->cpu_data()[s]);
+ }
+ // Fill top diff with random numbers
+ shared_ptr<Blob<Dtype> > tmp_blob(new Blob<Dtype>());
+ tmp_blob->ReshapeLike(*blob_top_2.get());
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(tmp_blob.get());
+ caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(),
+ this->blob_top_->mutable_cpu_diff());
+ caffe_copy(blob_top_2->count(), tmp_blob->cpu_data(),
+ blob_top_2->mutable_cpu_diff());
+ // Backward in-place
+ vector<bool> propagate_down;
+ propagate_down.push_back(true);
+ prelu.Backward(this->blob_top_vec_, propagate_down, this->blob_top_vec_);
+ ip.Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+ // Backward non-in-place
+ prelu2.Backward(blob_top_vec_2, propagate_down, blob_middle_vec_2);
+ ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2);
+ // Check numbers
+ for (int s = 0; s < blob_bottom_2->count(); ++s) {
+ EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]);
+ }
+ for (int s = 0; s < ip.blobs()[0]->count(); ++s) {
+ EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]);
+ }
+ for (int s = 0; s < ip.blobs()[1]->count(); ++s) {
+ EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]);
+ }
+ for (int s = 0; s < prelu.blobs()[0]->count(); ++s) {
+ EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s],
+ prelu2.blobs()[0]->cpu_diff()[s]);
+ }
+}
+
+#endif
+
+} // namespace caffe
diff --git a/unit_tests/test_pooling_layer.cpp b/unit_tests/test_pooling_layer.cpp
new file mode 100644
index 00000000..bde88448
--- /dev/null
+++ b/unit_tests/test_pooling_layer.cpp
@@ -0,0 +1,652 @@
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/pooling_layer.hpp"
+
+#ifdef USE_CUDNN
+#include "caffe/layers/cudnn_pooling_layer.hpp"
+#endif
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+#define TestDtypesAndDevices float_only
+
+
+#define SET_LAYER(name) \
+ layer_param.set_type(#name);\
+ shared_ptr<Layer<Dtype> > new_layer=\
+ LayerRegistry<Dtype>::CreateLayer(layer_param);\
+ shared_ptr< name ## Layer <Dtype> > layer= \
+ boost::static_pointer_cast< name ## Layer <Dtype> > (new_layer);\
+ if(0) layer=shared_ptr<name ## Layer<Dtype> >(new name ## Layer<Dtype>(layer_param));\
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+
+template <typename TypeParam>
+class PoolingLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+ PoolingLayerTest()
+ : blob_bottom_(new Blob<Dtype>()),
+ blob_top_(new Blob<Dtype>()),
+ blob_top_mask_(new Blob<Dtype>()) {}
+ virtual void SetUp() {
+ Caffe::set_random_seed(1701);
+ blob_bottom_->Reshape(2, 3, 6, 5);
+ // fill the values
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+ virtual ~PoolingLayerTest() {
+ delete blob_bottom_;
+ delete blob_top_;
+ delete blob_top_mask_;
+ }
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_top_;
+ Blob<Dtype>* const blob_top_mask_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+ // Test for 2x 2 square pooling layer
+ void TestForwardSquare() {
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_size(2);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+ const int num = 2;
+ const int channels = 2;
+ blob_bottom_->Reshape(num, channels, 3, 5);
+ // Input: 2x 2 channels of:
+ // [1 2 5 2 3]
+ // [9 4 1 4 8]
+ // [1 2 5 2 3]
+ for (int i = 0; i < 15 * num * channels; i += 15) {
+ blob_bottom_->mutable_cpu_data()[i + 0] = 1;
+ blob_bottom_->mutable_cpu_data()[i + 1] = 2;
+ blob_bottom_->mutable_cpu_data()[i + 2] = 5;
+ blob_bottom_->mutable_cpu_data()[i + 3] = 2;
+ blob_bottom_->mutable_cpu_data()[i + 4] = 3;
+ blob_bottom_->mutable_cpu_data()[i + 5] = 9;
+ blob_bottom_->mutable_cpu_data()[i + 6] = 4;
+ blob_bottom_->mutable_cpu_data()[i + 7] = 1;
+ blob_bottom_->mutable_cpu_data()[i + 8] = 4;
+ blob_bottom_->mutable_cpu_data()[i + 9] = 8;
+ blob_bottom_->mutable_cpu_data()[i + 10] = 1;
+ blob_bottom_->mutable_cpu_data()[i + 11] = 2;
+ blob_bottom_->mutable_cpu_data()[i + 12] = 5;
+ blob_bottom_->mutable_cpu_data()[i + 13] = 2;
+ blob_bottom_->mutable_cpu_data()[i + 14] = 3;
+ }
+
+ SET_LAYER(Pooling);
+
+ EXPECT_EQ(blob_top_->num(), num);
+ EXPECT_EQ(blob_top_->channels(), channels);
+ EXPECT_EQ(blob_top_->height(), 2);
+ EXPECT_EQ(blob_top_->width(), 4);
+ if (blob_top_vec_.size() > 1) {
+ EXPECT_EQ(blob_top_mask_->num(), num);
+ EXPECT_EQ(blob_top_mask_->channels(), channels);
+ EXPECT_EQ(blob_top_mask_->height(), 2);
+ EXPECT_EQ(blob_top_mask_->width(), 4);
+ }
+ layer->Forward(blob_bottom_vec_, blob_top_vec_);
+ // Expected output: 2x 2 channels of:
+ // [9 5 5 8]
+ // [9 5 5 8]
+ for (int i = 0; i < 8 * num * channels; i += 8) {
+ EXPECT_EQ(blob_top_->cpu_data()[i + 0], 9);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 1], 5);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 2], 5);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 3], 8);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 4], 9);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 5], 5);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 6], 5);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 7], 8);
+ }
+ if (blob_top_vec_.size() > 1) {
+ // Expected mask output: 2x 2 channels of:
+ // [5 2 2 9]
+ // [5 12 12 9]
+ for (int i = 0; i < 8 * num * channels; i += 8) {
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 5);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 2);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 2);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 9);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 5);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 12);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 12);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 9);
+ }
+ }
+ }
+ // Test for 3x 2 rectangular pooling layer with kernel_h > kernel_w
+ void TestForwardRectHigh() {
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_h(3);
+ pooling_param->set_kernel_w(2);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+ const int num = 2;
+ const int channels = 2;
+ blob_bottom_->Reshape(num, channels, 6, 6);
+ // Input: 2x 2 channels of:
+ // [35 1 6 26 19 24]
+ // [ 3 32 7 21 23 25]
+ // [31 9 2 22 27 20]
+ // [ 8 28 33 17 10 15]
+ // [30 5 34 12 14 16]
+ // [ 4 36 29 13 18 11]
+ // (this is generated by magic(6) in MATLAB)
+ for (int i = 0; i < 36 * num * channels; i += 36) {
+ blob_bottom_->mutable_cpu_data()[i + 0] = 35;
+ blob_bottom_->mutable_cpu_data()[i + 1] = 1;
+ blob_bottom_->mutable_cpu_data()[i + 2] = 6;
+ blob_bottom_->mutable_cpu_data()[i + 3] = 26;
+ blob_bottom_->mutable_cpu_data()[i + 4] = 19;
+ blob_bottom_->mutable_cpu_data()[i + 5] = 24;
+ blob_bottom_->mutable_cpu_data()[i + 6] = 3;
+ blob_bottom_->mutable_cpu_data()[i + 7] = 32;
+ blob_bottom_->mutable_cpu_data()[i + 8] = 7;
+ blob_bottom_->mutable_cpu_data()[i + 9] = 21;
+ blob_bottom_->mutable_cpu_data()[i + 10] = 23;
+ blob_bottom_->mutable_cpu_data()[i + 11] = 25;
+ blob_bottom_->mutable_cpu_data()[i + 12] = 31;
+ blob_bottom_->mutable_cpu_data()[i + 13] = 9;
+ blob_bottom_->mutable_cpu_data()[i + 14] = 2;
+ blob_bottom_->mutable_cpu_data()[i + 15] = 22;
+ blob_bottom_->mutable_cpu_data()[i + 16] = 27;
+ blob_bottom_->mutable_cpu_data()[i + 17] = 20;
+ blob_bottom_->mutable_cpu_data()[i + 18] = 8;
+ blob_bottom_->mutable_cpu_data()[i + 19] = 28;
+ blob_bottom_->mutable_cpu_data()[i + 20] = 33;
+ blob_bottom_->mutable_cpu_data()[i + 21] = 17;
+ blob_bottom_->mutable_cpu_data()[i + 22] = 10;
+ blob_bottom_->mutable_cpu_data()[i + 23] = 15;
+ blob_bottom_->mutable_cpu_data()[i + 24] = 30;
+ blob_bottom_->mutable_cpu_data()[i + 25] = 5;
+ blob_bottom_->mutable_cpu_data()[i + 26] = 34;
+ blob_bottom_->mutable_cpu_data()[i + 27] = 12;
+ blob_bottom_->mutable_cpu_data()[i + 28] = 14;
+ blob_bottom_->mutable_cpu_data()[i + 29] = 16;
+ blob_bottom_->mutable_cpu_data()[i + 30] = 4;
+ blob_bottom_->mutable_cpu_data()[i + 31] = 36;
+ blob_bottom_->mutable_cpu_data()[i + 32] = 29;
+ blob_bottom_->mutable_cpu_data()[i + 33] = 13;
+ blob_bottom_->mutable_cpu_data()[i + 34] = 18;
+ blob_bottom_->mutable_cpu_data()[i + 35] = 11;
+ }
+
+ SET_LAYER(Pooling);
+
+ EXPECT_EQ(blob_top_->channels(), channels);
+ EXPECT_EQ(blob_top_->height(), 4);
+ EXPECT_EQ(blob_top_->width(), 5);
+ if (blob_top_vec_.size() > 1) {
+ EXPECT_EQ(blob_top_mask_->num(), num);
+ EXPECT_EQ(blob_top_mask_->channels(), channels);
+ EXPECT_EQ(blob_top_mask_->height(), 4);
+ EXPECT_EQ(blob_top_mask_->width(), 5);
+ }
+ layer->Forward(blob_bottom_vec_, blob_top_vec_);
+ // Expected output: 2x 2 channels of:
+ // [35 32 26 27 27]
+ // [32 33 33 27 27]
+ // [31 34 34 27 27]
+ // [36 36 34 18 18]
+ for (int i = 0; i < 20 * num * channels; i += 20) {
+ EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 3], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 4], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 6], 33);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 7], 33);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 8], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 9], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 10], 31);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 11], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 13], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 14], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 15], 36);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 17], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 18], 18);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18);
+ }
+ if (blob_top_vec_.size() > 1) {
+ // [ 1 8 4 17 17]
+ // [ 8 21 21 17 17]
+ // [13 27 27 17 17]
+ // [32 32 27 35 35]
+ for (int i = 0; i < 20 * num * channels; i += 20) {
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 20);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 20);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 12);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 31);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 34);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34);
+ }
+ }
+ }
+ // Test for rectangular pooling layer with kernel_w > kernel_h
+ void TestForwardRectWide() {
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_h(2);
+ pooling_param->set_kernel_w(3);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+ const int num = 2;
+ const int channels = 2;
+ blob_bottom_->Reshape(num, channels, 6, 6);
+ // Input: 2x 2 channels of:
+ // [35 1 6 26 19 24]
+ // [ 3 32 7 21 23 25]
+ // [31 9 2 22 27 20]
+ // [ 8 28 33 17 10 15]
+ // [30 5 34 12 14 16]
+ // [ 4 36 29 13 18 11]
+ // (this is generated by magic(6) in MATLAB)
+ for (int i = 0; i < 36 * num * channels; i += 36) {
+ blob_bottom_->mutable_cpu_data()[i + 0] = 35;
+ blob_bottom_->mutable_cpu_data()[i + 1] = 1;
+ blob_bottom_->mutable_cpu_data()[i + 2] = 6;
+ blob_bottom_->mutable_cpu_data()[i + 3] = 26;
+ blob_bottom_->mutable_cpu_data()[i + 4] = 19;
+ blob_bottom_->mutable_cpu_data()[i + 5] = 24;
+ blob_bottom_->mutable_cpu_data()[i + 6] = 3;
+ blob_bottom_->mutable_cpu_data()[i + 7] = 32;
+ blob_bottom_->mutable_cpu_data()[i + 8] = 7;
+ blob_bottom_->mutable_cpu_data()[i + 9] = 21;
+ blob_bottom_->mutable_cpu_data()[i + 10] = 23;
+ blob_bottom_->mutable_cpu_data()[i + 11] = 25;
+ blob_bottom_->mutable_cpu_data()[i + 12] = 31;
+ blob_bottom_->mutable_cpu_data()[i + 13] = 9;
+ blob_bottom_->mutable_cpu_data()[i + 14] = 2;
+ blob_bottom_->mutable_cpu_data()[i + 15] = 22;
+ blob_bottom_->mutable_cpu_data()[i + 16] = 27;
+ blob_bottom_->mutable_cpu_data()[i + 17] = 20;
+ blob_bottom_->mutable_cpu_data()[i + 18] = 8;
+ blob_bottom_->mutable_cpu_data()[i + 19] = 28;
+ blob_bottom_->mutable_cpu_data()[i + 20] = 33;
+ blob_bottom_->mutable_cpu_data()[i + 21] = 17;
+ blob_bottom_->mutable_cpu_data()[i + 22] = 10;
+ blob_bottom_->mutable_cpu_data()[i + 23] = 15;
+ blob_bottom_->mutable_cpu_data()[i + 24] = 30;
+ blob_bottom_->mutable_cpu_data()[i + 25] = 5;
+ blob_bottom_->mutable_cpu_data()[i + 26] = 34;
+ blob_bottom_->mutable_cpu_data()[i + 27] = 12;
+ blob_bottom_->mutable_cpu_data()[i + 28] = 14;
+ blob_bottom_->mutable_cpu_data()[i + 29] = 16;
+ blob_bottom_->mutable_cpu_data()[i + 30] = 4;
+ blob_bottom_->mutable_cpu_data()[i + 31] = 36;
+ blob_bottom_->mutable_cpu_data()[i + 32] = 29;
+ blob_bottom_->mutable_cpu_data()[i + 33] = 13;
+ blob_bottom_->mutable_cpu_data()[i + 34] = 18;
+ blob_bottom_->mutable_cpu_data()[i + 35] = 11;
+ }
+
+ SET_LAYER(Pooling);
+
+ EXPECT_EQ(blob_top_->num(), num);
+ EXPECT_EQ(blob_top_->channels(), channels);
+ EXPECT_EQ(blob_top_->height(), 5);
+ EXPECT_EQ(blob_top_->width(), 4);
+ if (blob_top_vec_.size() > 1) {
+ EXPECT_EQ(blob_top_mask_->num(), num);
+ EXPECT_EQ(blob_top_mask_->channels(), channels);
+ EXPECT_EQ(blob_top_mask_->height(), 5);
+ EXPECT_EQ(blob_top_mask_->width(), 4);
+ }
+ layer->Forward(blob_bottom_vec_, blob_top_vec_);
+ // Expected output: 2x 2 channels of:
+ // [35 32 26 26]
+ // [32 32 27 27]
+ // [33 33 33 27]
+ // [34 34 34 17]
+ // [36 36 34 18]
+ for (int i = 0; i < 20 * num * channels; i += 20) {
+ EXPECT_EQ(blob_top_->cpu_data()[i + 0], 35);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 1], 32);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 2], 26);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 3], 26);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 4], 32);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 5], 32);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 6], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 7], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 8], 33);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 9], 33);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 10], 33);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 11], 27);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 12], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 13], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 14], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 15], 17);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 16], 36);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 17], 36);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 18], 34);
+ EXPECT_EQ(blob_top_->cpu_data()[i + 19], 18);
+ }
+ if (blob_top_vec_.size() > 1) {
+ // [ 1 8 4 4]
+ // [ 8 8 17 17]
+ // [21 21 21 17]
+ // [27 27 27 22]
+ // [32 32 27 35]
+ for (int i = 0; i < 20 * num * channels; i += 20) {
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 0], 0);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 1], 7);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 2], 3);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 3], 3);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 4], 7);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 5], 7);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 6], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 7], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 8], 20);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 9], 20);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 10], 20);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 11], 16);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 12], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 13], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 14], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 15], 21);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 16], 31);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 17], 31);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 18], 26);
+ EXPECT_EQ(blob_top_mask_->cpu_data()[i + 19], 34);
+ }
+ }
+ }
+};
+
+
+
+
+TYPED_TEST_CASE(PoolingLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(PoolingLayerTest, TestSetup) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_size(3);
+ pooling_param->set_stride(2);
+
+ SET_LAYER(Pooling);
+ //PoolingLayer<Dtype> layer(layer_param);
+ //layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
+ EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
+ EXPECT_EQ(this->blob_top_->height(), 3);
+ EXPECT_EQ(this->blob_top_->width(), 2);
+}
+
+TYPED_TEST(PoolingLayerTest, TestSetupPadded) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_size(3);
+ pooling_param->set_stride(2);
+ pooling_param->set_pad(1);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+// PoolingLayer<Dtype> layer(layer_param);
+// layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ SET_LAYER(Pooling);
+
+ EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
+ EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
+ EXPECT_EQ(this->blob_top_->height(), 4);
+ EXPECT_EQ(this->blob_top_->width(), 3);
+}
+
+TYPED_TEST(PoolingLayerTest, TestSetupGlobalPooling) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_global_pooling(true);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+
+// PoolingLayer<Dtype> layer(layer_param);
+// layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ SET_LAYER(Pooling);
+
+ EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
+ EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_->channels());
+ EXPECT_EQ(this->blob_top_->height(), 1);
+ EXPECT_EQ(this->blob_top_->width(), 1);
+}
+
+TYPED_TEST(PoolingLayerTest, TestForwardMax) {
+ this->TestForwardSquare();
+ this->TestForwardRectHigh();
+ this->TestForwardRectWide();
+}
+
+TYPED_TEST(PoolingLayerTest, TestForwardMaxTopMask) {
+ this->blob_top_vec_.push_back(this->blob_top_mask_);
+ this->TestForwardSquare();
+ this->TestForwardRectHigh();
+ this->TestForwardRectWide();
+}
+
+TYPED_TEST(PoolingLayerTest, TestForwardMaxPadded) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_size(3);
+ pooling_param->set_stride(2);
+ pooling_param->set_pad(2);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+ this->blob_bottom_->Reshape(1, 1, 3, 3);
+ // Input:
+ // [ 1 2 4 ]
+ // [ 2 3 2 ]
+ // [ 4 2 1 ]
+ this->blob_bottom_->mutable_cpu_data()[0] = 1;
+ this->blob_bottom_->mutable_cpu_data()[1] = 2;
+ this->blob_bottom_->mutable_cpu_data()[2] = 4;
+ this->blob_bottom_->mutable_cpu_data()[3] = 2;
+ this->blob_bottom_->mutable_cpu_data()[4] = 3;
+ this->blob_bottom_->mutable_cpu_data()[5] = 2;
+ this->blob_bottom_->mutable_cpu_data()[6] = 4;
+ this->blob_bottom_->mutable_cpu_data()[7] = 2;
+ this->blob_bottom_->mutable_cpu_data()[8] = 1;
+
+
+ SET_LAYER(Pooling);
+
+ EXPECT_EQ(this->blob_top_->num(), 1);
+ EXPECT_EQ(this->blob_top_->channels(), 1);
+ EXPECT_EQ(this->blob_top_->height(), 3);
+ EXPECT_EQ(this->blob_top_->width(), 3);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ Dtype epsilon = 1e-8;
+ // Output:
+ // [ 1 4 4 ]
+ // [ 4 4 4 ]
+ // [ 4 4 1 ]
+ EXPECT_NEAR(this->blob_top_->cpu_data()[0], 1, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[2], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[4], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[6], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[8], 1, epsilon);
+}
+
+template <typename Dtype>
+void fill_bottom_data(Blob<Dtype >* bottom)
+{
+ for(int i=0;i<bottom->num();i++)
+ for(int j=0;j<bottom->channels();j++)
+ for(int l=0;l<bottom->height();l++)
+ for(int k=0;k<bottom->width();k++)
+ {
+ int offset;
+ Dtype * ptr;
+
+ offset=i*bottom->channels()*bottom->height()*bottom->width()+
+ j*bottom->height()*bottom->width()+
+ l*bottom->width()+k;
+
+ ptr=bottom->mutable_cpu_data();
+
+ ptr[offset]=offset;
+
+ }
+
+
+}
+
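+/* Reference check for MAX pooling with zero padding: for every top element,
+ * recompute the max over its kernel_size x kernel_size window in the bottom
+ * blob (window origin = stride * output index) and compare against the
+ * pooled value within epsilon; at most the first 10 mismatches are printed. */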
+template <typename Dtype>
+void check_top_data(Blob<Dtype>* bottom, Blob<Dtype>* top, int kernel_size, int stride)
+{
+ Dtype epsilon = 1e-5;
+ int error_count=0;
+
+ for(int i=0;i<top->num();i++)
+ for(int j=0;j<top->channels();j++)
+ for(int l=0;l<top->height();l++)
+ for(int k=0;k<top->width();k++)
+ {
+ Dtype pool_data=top->data_at(i,j,l,k);
+ Dtype max_bottom_data=-100000000;
+
+ int top_h=stride*l;  /* origin of the pooling window in the bottom blob */
+ int top_w=stride*k;
+
+ /* calculate the max over the kernel_size x kernel_size window */
+ for(int x=0;x<kernel_size;x++)
+ for(int y=0;y<kernel_size;y++)
+ {
+ Dtype d=bottom->data_at(i,j,top_h+x,top_w+y);
+
+ if(d>max_bottom_data)
+ max_bottom_data=d;
+ }
+
+ Dtype offset=pool_data-max_bottom_data;
+ if(offset>epsilon || offset<-epsilon)
+ {
+ std::cout<<error_count<<": ";
+ std::cout<<i<<" "<<j<<" "<<l<<" "<<k<<" bottom: ";
+ std::cout<<top_h<<" "<<top_w<<std::endl;
+ std::cout<<"pooled: "<<pool_data<<" bottom:"<<max_bottom_data<<std::endl;
+
+ error_count++;
+ }
+
+ if(error_count==10)
+ return;
+ }
+
+
+
+}
+
+
+TYPED_TEST(PoolingLayerTest, TestMax) {
+ typedef typename TypeParam::Dtype Dtype;
+
+ int kernel_size=3;
+ int stride=2;
+ int channel_number=96;
+ int h=55;
+ int w=55;
+
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_size(kernel_size);
+ pooling_param->set_stride(stride);
+ pooling_param->set_pad(0);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_MAX);
+ this->blob_bottom_->Reshape(1, channel_number, h, w);
+ FillerParameter filler_param;
+ filler_param.set_value(Dtype(2));
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+
+ SET_LAYER(Pooling);
+
+ //fill_bottom_data(this->blob_bottom_);
+
+#if 0
+ EXPECT_EQ(this->blob_top_->num(), 1);
+ EXPECT_EQ(this->blob_top_->channels(), 96);
+ EXPECT_EQ(this->blob_top_->height(), 27);
+ EXPECT_EQ(this->blob_top_->width(), 27);
+#endif
+
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+ check_top_data(this->blob_bottom_,this->blob_top_,kernel_size,stride);
+
+}
+
+TYPED_TEST(PoolingLayerTest, TestForwardAve) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+ PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
+ pooling_param->set_kernel_size(3);
+ pooling_param->set_stride(1);
+ pooling_param->set_pad(1);
+ pooling_param->set_pool(PoolingParameter_PoolMethod_AVE);
+ this->blob_bottom_->Reshape(1, 1, 3, 3);
+ FillerParameter filler_param;
+ filler_param.set_value(Dtype(2));
+ ConstantFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ PoolingLayer<Dtype> layer(layer_param);
+ layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ EXPECT_EQ(this->blob_top_->num(), 1);
+ EXPECT_EQ(this->blob_top_->channels(), 1);
+ EXPECT_EQ(this->blob_top_->height(), 3);
+ EXPECT_EQ(this->blob_top_->width(), 3);
+ layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+ Dtype epsilon = 1e-5;
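+  // With a constant input of 2, kernel 3, stride 1 and pad 1, each output is
+  // 2 * (window cells inside the image) / 9: corners cover 4 cells (8/9),
+  // edges cover 6 (4/3), and the centre covers all 9 (2.0).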
+ EXPECT_NEAR(this->blob_top_->cpu_data()[0], 8.0 / 9, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[1], 4.0 / 3, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[2], 8.0 / 9, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[3], 4.0 / 3, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[4], 2.0 , epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[5], 4.0 / 3, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[6], 8.0 / 9, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[7], 4.0 / 3, epsilon);
+ EXPECT_NEAR(this->blob_top_->cpu_data()[8], 8.0 / 9, epsilon);
+}
+
+
+} // namespace caffe
diff --git a/unit_tests/test_softmax_layer.cpp b/unit_tests/test_softmax_layer.cpp
new file mode 100644
index 00000000..ab2f8362
--- /dev/null
+++ b/unit_tests/test_softmax_layer.cpp
@@ -0,0 +1,99 @@
+#include <cmath>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/softmax_layer.hpp"
+
+#ifdef USE_CUDNN
+#include "caffe/layers/cudnn_softmax_layer.hpp"
+#endif
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class SoftmaxLayerTest : public MultiDeviceTest<TypeParam> {
+ typedef typename TypeParam::Dtype Dtype;
+ protected:
+ SoftmaxLayerTest()
+ : blob_bottom_(new Blob<Dtype>(2, 10, 1, 1)),
+ blob_top_(new Blob<Dtype>()) {
+ // fill the values
+ FillerParameter filler_param;
+ GaussianFiller<Dtype> filler(filler_param);
+ filler.Fill(this->blob_bottom_);
+ blob_bottom_vec_.push_back(blob_bottom_);
+ blob_top_vec_.push_back(blob_top_);
+ }
+ virtual ~SoftmaxLayerTest() { delete blob_bottom_; delete blob_top_; }
+ Blob<Dtype>* const blob_bottom_;
+ Blob<Dtype>* const blob_top_;
+ vector<Blob<Dtype>*> blob_bottom_vec_;
+ vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+
+typedef ::testing::Types<CPUDevice<float> > float_only;
+
+#define TestDtypesAndDevices float_only
+
+
+TYPED_TEST_CASE(SoftmaxLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(SoftmaxLayerTest, TestForward) {
+ typedef typename TypeParam::Dtype Dtype;
+ LayerParameter layer_param;
+
+
+
+ layer_param.set_type("Softmax");
+
+ shared_ptr<Layer<Dtype> > new_layer=
+ LayerRegistry<Dtype>::CreateLayer(layer_param);
+
+ shared_ptr<SoftmaxLayer<Dtype> > layer=
+ boost::static_pointer_cast<SoftmaxLayer<Dtype> > (new_layer);
+
+// layer=shared_ptr<SoftmaxLayer<Dtype> >(new SoftmaxLayer<Dtype>(layer_param));
+
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+ layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+
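+  // Along the channel axis, top(i,j,k,l) = exp(bottom(i,j,k,l)) / scale with
+  // scale = sum_j exp(bottom(i,j,k,l)), so every channel slice must sum to 1.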
+ // Test sum
+ for (int i = 0; i < this->blob_bottom_->num(); ++i) {
+ for (int k = 0; k < this->blob_bottom_->height(); ++k) {
+ for (int l = 0; l < this->blob_bottom_->width(); ++l) {
+ Dtype sum = 0;
+ for (int j = 0; j < this->blob_top_->channels(); ++j) {
+ sum += this->blob_top_->data_at(i, j, k, l);
+ }
+ EXPECT_GE(sum, 0.999);
+ EXPECT_LE(sum, 1.001);
+ // Test exact values
+ Dtype scale = 0;
+ for (int j = 0; j < this->blob_bottom_->channels(); ++j) {
+ scale += exp(this->blob_bottom_->data_at(i, j, k, l));
+ }
+ for (int j = 0; j < this->blob_bottom_->channels(); ++j) {
+ EXPECT_GE(this->blob_top_->data_at(i, j, k, l) + 1e-4,
+ exp(this->blob_bottom_->data_at(i, j, k, l)) / scale)
+ << "debug: " << i << " " << j;
+ EXPECT_LE(this->blob_top_->data_at(i, j, k, l) - 1e-4,
+ exp(this->blob_bottom_->data_at(i, j, k, l)) / scale)
+ << "debug: " << i << " " << j;
+ }
+ }
+ }
+ }
+}
+
+
+
+} // namespace caffe
diff --git a/unit_tests/testbed.c b/unit_tests/testbed.c
new file mode 100644
index 00000000..d33433d0
--- /dev/null
+++ b/unit_tests/testbed.c
@@ -0,0 +1,146 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+
+
+#include "pmu.h"
+#include "testbed.h"
+
+struct armv8_event
+{
+ char * name;
+ int id;
+ uint32_t init_val;
+ char * note;
+};
+
+
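+/* ARMv8 common architectural PMU events (event numbers per the Arm ARM):
+ * 0x08 INST_RETIRED, 0x11 CPU_CYCLES, 0x03/0x04 L1D refill/access,
+ * 0x17/0x16 L2D refill/access. Despite the a57_ prefix, these are the
+ * common architectural events, not Cortex-A57-specific ones. */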
+static struct armv8_event a57_list[6]=
+{
+ {"INST",0x8,0,"instruction retired"},
+ {"CYCL",0x11,0,"CPU running cycle"},
+ {"L1D MISS",0x3,0,"L1D CACHE MISS/REFILL"},
+ {"L1D ACCESS",0x4,0,"L1D CACHE ACCESS"},
+ {"L2 MISS",0x17,0,"L2 CACHE MISS/REFILL"},
+ {"L2 ACCESS",0x16,0,"L2 CACHE ACCESS"}
+};
+
+static int e[6];
+
+void init_testbed(void)
+{
+ int i;
+ struct armv8_event * p_list;
+
+ init_pmu_registers();
+
+ p_list=a57_list;
+
+ for(i=0;i<6;i++)
+ {
+ e[i]=create_pmu_event(p_list[i].name,p_list[i].id,
+ p_list[i].init_val,p_list[i].note);
+ }
+
+}
+
+void run_test(int repetition, int warm_up, void (*test_func)(void *), void *arg)
+{
+ uint32_t t0,t1;
+ uint32_t freq;
+ uint32_t cycle;
+ uint64_t total_time=0;
+ uint32_t loop_count=0;
+ int i;
+
+ if(warm_up)
+ test_func(arg);
+
+
+ freq=read_32bit_sysreg(CNTFRQ_EL0);
+
+ start_pmu_event(e[0]);
+ start_pmu_event(e[1]);
+ start_pmu_event(e[2]);
+ start_pmu_event(e[3]);
+ start_pmu_event(e[4]);
+ start_pmu_event(e[5]);
+
+ set_pmu_event_base(e[0]);
+ set_pmu_event_base(e[1]);
+ set_pmu_event_base(e[2]);
+ set_pmu_event_base(e[3]);
+ set_pmu_event_base(e[4]);
+ set_pmu_event_base(e[5]);
+
+ t0=read_32bit_sysreg(CNTVCT_EL0);
+
+ for(i=0;i<repetition;i++)
+ {
+ test_func(arg);
+
+ record_pmu_event(e[0],0,1,1);
+ record_pmu_event(e[1],0,1,1);
+ record_pmu_event(e[2],0,1,1);
+ record_pmu_event(e[3],0,1,1);
+ record_pmu_event(e[4],0,1,1);
+ record_pmu_event(e[5],0,1,1);
+
+ t1=read_32bit_sysreg(CNTVCT_EL0);
+ loop_count++;
+ total_time+=(t1-t0);
+ t0=t1;
+
+ }
+
+
+ stop_pmu_event(e[0]);
+ stop_pmu_event(e[1]);
+ stop_pmu_event(e[2]);
+ stop_pmu_event(e[3]);
+ stop_pmu_event(e[4]);
+ stop_pmu_event(e[5]);
+
+ dump_pmu_event_stat(e[0]);
+ dump_pmu_event_stat(e[1]);
+ dump_pmu_event_stat(e[2]);
+ dump_pmu_event_stat(e[3]);
+ dump_pmu_event_stat(e[4]);
+ dump_pmu_event_stat(e[5]);
+
+
+ printf("\n------------------------------------\n\n");
+
+
+ cycle=get_pmu_stat_avg(e[1]);
+ t0=total_time/loop_count;
+
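+ /*
+ * Derived figures: t0 is the mean generic-timer ticks per iteration, so
+ * wall time in ms = 1000 * t0 / CNTFRQ_EL0; CPU frequency follows from
+ * retired cycles over the same ticks; IPC = instructions / cycles; the
+ * cache figures below are miss/access ratios.
+ */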
+
+ printf("freq is 0x%x\n",freq);
+ printf("pysical counter pass: 0x%x (0x%lx/%u)\n",t0,total_time,loop_count);
+ printf("coverted to ms: %.3f\n",1000.0*t0/freq);
+
+
+ printf("CPU freq: %.2f MHZ (cycle:0x%x)\n",(float)freq*cycle/t0/1000000,cycle);
+
+ printf("IPC is: %.2f \n",(float)get_pmu_stat_avg(e[0])/cycle);
+ printf("L1 CACHE MISS is: %.2f \n",(float)get_pmu_stat_avg(e[2])/get_pmu_stat_avg(e[3]));
+ printf("L2 CACHE MISS is: %.2f \n",(float)get_pmu_stat_avg(e[4])/get_pmu_stat_avg(e[5]));
+
+ /* reset all records */
+
+}
+
+void release_testbed(void)
+{
+
+ release_pmu_event(e[0]);
+ release_pmu_event(e[1]);
+ release_pmu_event(e[2]);
+ release_pmu_event(e[3]);
+ release_pmu_event(e[4]);
+ release_pmu_event(e[5]);
+}
diff --git a/unit_tests/testbed.h b/unit_tests/testbed.h
new file mode 100644
index 00000000..7a7f3df1
--- /dev/null
+++ b/unit_tests/testbed.h
@@ -0,0 +1,10 @@
+#ifndef __TESTBED_H__
+#define __TESTBED_H__
+
+void init_testbed(void);
+
+void run_test(int repetition, int warm_up, void (*test_func)(void *), void *arg);
+
+void release_testbed(void);
+
+#endif