author     Ma Mingfei <mingfei.ma@intel.com>      2018-03-30 06:25:07 +0800
committer  Soumith Chintala <soumith@gmail.com>   2018-03-29 15:25:07 -0700
commit     f8270c0225e19403038aec2d8af2697a2b5326ec (patch)
tree       7dbc688871d3943a2ec3f2d896e8d4e0637d990b
parent     e4c0bb1809fd9bf9161392bfff7d06092adc224d (diff)
Enable MKLDNN convolution forward and backward (#6062)
* Enable MKLDNN convolution forward and backward
* minor change
* fix mkldnn build error when building ATen standalone
-rw-r--r--  aten/CMakeLists.txt                          |  14
-rw-r--r--  aten/cmake/FindMKLDNN.cmake                  |  32
-rw-r--r--  aten/src/ATen/CMakeLists.txt                 |  12
-rw-r--r--  aten/src/ATen/Config.h.in                    |   1
-rw-r--r--  aten/src/ATen/mkldnn/Runtime.cpp             |   5
-rw-r--r--  aten/src/ATen/mkldnn/Runtime.h               |  49
-rw-r--r--  aten/src/ATen/native/Convolution.cpp         |  27
-rw-r--r--  aten/src/ATen/native/mkldnn/Conv.cpp         | 441
-rw-r--r--  aten/src/ATen/native/native_functions.yaml   |  12
-rw-r--r--  setup.py                                     |  19
-rw-r--r--  tools/autograd/derivatives.yaml              |   4
-rwxr-xr-x  tools/build_pytorch_libs.sh                  |  10
-rw-r--r--  tools/setup_helpers/mkldnn.py                |  81
13 files changed, 705 insertions(+), 2 deletions(-)
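
For orientation before the per-file hunks: the patch adds a new ATen-native entry point, at::mkldnn_convolution, plus its backward functions, and wires them into the generic _convolution dispatcher. A minimal usage sketch against the ATen API of this era (shapes illustrative, not taken from the patch):

    #include <ATen/ATen.h>

    int main() {
      // CPU float NCHW tensors -- the only configuration the new
      // ConvParams::use_mkldnn() predicate accepts (see Convolution.cpp below).
      auto input  = at::CPU(at::kFloat).ones({1, 3, 224, 224});  // N, C, H, W
      auto weight = at::CPU(at::kFloat).ones({64, 3, 7, 7});     // OC, IC, KH, KW
      auto bias   = at::CPU(at::kFloat).ones({64});
      // dilation must stay {1, 1}: the MKLDNN path rejects dilated convolution.
      auto output = at::mkldnn_convolution(
          input, weight, bias,
          /*padding=*/{3, 3}, /*stride=*/{2, 2}, /*dilation=*/{1, 1});
      return output.defined() ? 0 : 1;
    }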
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 16e7cc6790..13e4abca94 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -460,6 +460,20 @@ ELSE()
set(AT_CUDNN_ENABLED 1)
ENDIF()
+if(NO_MKLDNN)
+ message("disabling MKLDNN because NO_MKLDNN is set")
+ set(AT_MKLDNN_ENABLED 0)
+else()
+ find_package(MKLDNN)
+ if(NOT MKLDNN_FOUND)
+ message(STATUS "MKLDNN not found. Compiling without MKLDNN support")
+ set(AT_MKLDNN_ENABLED 0)
+ else()
+ INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIRS})
+ set(AT_MKLDNN_ENABLED 1)
+ endif()
+endif()
+
if(NO_NNPACK)
message("disabling NNPACK because NO_NNPACK is set")
set(AT_NNPACK_ENABLED 0)
diff --git a/aten/cmake/FindMKLDNN.cmake b/aten/cmake/FindMKLDNN.cmake
new file mode 100644
index 0000000000..0862d5a3ac
--- /dev/null
+++ b/aten/cmake/FindMKLDNN.cmake
@@ -0,0 +1,32 @@
+# - Try to find MKLDNN
+#
+# The following variables are optionally searched for defaults
+# MKLDNN_ROOT_DIR: Base directory where all MKLDNN components are found
+#
+# The following are set after configuration is done:
+# MKLDNN_FOUND
+# MKLDNN_INCLUDE_DIRS
+# MKLDNN_LIBRARIES
+# MKLDNN_LIBRARY_DIRS
+
+include(FindPackageHandleStandardArgs)
+
+set(MKLDNN_ROOT_DIR "" CACHE PATH "Folder containing Intel MKLDNN")
+
+find_path(MKLDNN_INCLUDE_DIR mkldnn.h
+ HINTS ${MKLDNN_ROOT_DIR}
+ PATH_SUFFIXES include)
+
+find_library(MKLDNN_LIBRARY mkldnn
+ HINTS ${MKLDNN_LIB_DIR} ${MKLDNN_ROOT_DIR}
+ PATH_SUFFIXES lib lib64)
+
+find_package_handle_standard_args(
+ MKLDNN DEFAULT_MSG MKLDNN_INCLUDE_DIR MKLDNN_LIBRARY)
+
+if(MKLDNN_FOUND)
+ set(MKLDNN_INCLUDE_DIRS ${MKLDNN_INCLUDE_DIR})
+ set(MKLDNN_LIBRARIES ${MKLDNN_LIBRARY})
+ message(STATUS "Found MKLDNN (include: ${MKLDNN_INCLUDE_DIR}, library: ${MKLDNN_LIBRARY})")
+ mark_as_advanced(MKLDNN_ROOT_DIR MKLDNN_LIBRARY MKLDNN_INCLUDE_DIR)
+endif()
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index daf986c6bd..27f1d98461 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -149,6 +149,7 @@ FILE(GLOB native_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/*.cpp")
FILE(GLOB native_cudnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/cudnn/*.cpp")
FILE(GLOB native_cuda_cu RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/cuda/*.cu")
FILE(GLOB native_mkl_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/mkl/*.cpp")
+FILE(GLOB native_mkldnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/mkldnn/*.cpp")
FILE(GLOB_RECURSE cuda_h
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
@@ -156,6 +157,7 @@ FILE(GLOB_RECURSE cuda_h
FILE(GLOB cudnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "cudnn/*.cpp")
FILE(GLOB mkl_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "mkl/*.cpp")
+FILE(GLOB mkldnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "mkldnn/*.cpp")
FILE(GLOB all_python RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.py")
@@ -201,7 +203,7 @@ ADD_CUSTOM_TARGET(aten_files_are_generated
)
-SET(all_cpp ${base_cpp} ${native_cpp} ${native_cudnn_cpp} ${native_mkl_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp})
+SET(all_cpp ${base_cpp} ${native_cpp} ${native_cudnn_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp})
INCLUDE_DIRECTORIES(${ATen_CPU_INCLUDE})
IF(NOT NO_CUDA)
@@ -218,6 +220,10 @@ IF(NOT NO_CUDA)
ENDIF()
endif()
+IF(AT_MKLDNN_ENABLED)
+ SET(all_cpp ${all_cpp} ${mkldnn_cpp})
+ENDIF()
+
filter_list(generated_h generated_cpp "\\.h$")
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/..)
@@ -315,6 +321,9 @@ if (NNPACK_FOUND)
target_link_libraries(ATen ${NNPACK_LIBRARIES})
endif(NNPACK_FOUND)
+if(MKLDNN_FOUND)
+ target_link_libraries(ATen ${MKLDNN_LIBRARIES})
+endif(MKLDNN_FOUND)
# ---[ Configure cpuinfo
IF(NOT TARGET cpuinfo)
@@ -326,7 +335,6 @@ IF(NOT TARGET cpuinfo)
ENDIF()
TARGET_LINK_LIBRARIES(ATen cpuinfo)
-
IF(CUDA_FOUND)
TARGET_LINK_LIBRARIES(ATen
${CUDA_LIBRARIES}
diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in
index d62468fd70..1ab0ec9162 100644
--- a/aten/src/ATen/Config.h.in
+++ b/aten/src/ATen/Config.h.in
@@ -6,6 +6,7 @@
#define AT_CUDA_ENABLED() @AT_CUDA_ENABLED@
#define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@
+#define AT_MKLDNN_ENABLED() @AT_MKLDNN_ENABLED@
#define AT_NNPACK_ENABLED() @AT_NNPACK_ENABLED@
#define AT_MKL_ENABLED() @AT_MKL_ENABLED@
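
The generated macro joins the existing AT_CUDNN_ENABLED()/AT_NNPACK_ENABLED() family: codegen substitutes 0 or 1, and call sites branch on it at compile time. A minimal sketch of the consuming pattern (the helper name is hypothetical, not part of the patch):

    #include <ATen/Config.h>

    // Hypothetical helper: reports whether this build carries the MKLDNN path.
    bool built_with_mkldnn() {
    #if AT_MKLDNN_ENABLED()
      return true;
    #else
      return false;
    #endif
    }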
diff --git a/aten/src/ATen/mkldnn/Runtime.cpp b/aten/src/ATen/mkldnn/Runtime.cpp
new file mode 100644
index 0000000000..54f999ed14
--- /dev/null
+++ b/aten/src/ATen/mkldnn/Runtime.cpp
@@ -0,0 +1,5 @@
+#include "Runtime.h"
+
+namespace at { namespace native {
+
+}} // namespace at::native
diff --git a/aten/src/ATen/mkldnn/Runtime.h b/aten/src/ATen/mkldnn/Runtime.h
new file mode 100644
index 0000000000..c58ef2c56f
--- /dev/null
+++ b/aten/src/ATen/mkldnn/Runtime.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <mkldnn.hpp>
+
+using namespace mkldnn;
+
+namespace at { namespace native {
+
+// CpuEngine singleton
+struct CpuEngine {
+ static CpuEngine& Instance() {
+ static CpuEngine myInstance;
+ return myInstance;
+ }
+ engine& get_engine() {
+ return _cpu_engine;
+ }
+ CpuEngine(CpuEngine const&) = delete;
+ CpuEngine& operator=(CpuEngine const&) = delete;
+
+protected:
+ CpuEngine():_cpu_engine(mkldnn::engine::cpu, 0) {}
+ ~CpuEngine() {}
+
+private:
+ engine _cpu_engine;
+};
+
+// Stream singleton
+struct Stream {
+ static Stream& Instance() {
+ static Stream myInstance;
+ return myInstance;
+ }
+ stream& get_stream() {
+ return _cpu_stream;
+ }
+ Stream(Stream const&) = delete;
+ Stream& operator=(Stream const&) = delete;
+
+protected:
+ Stream():_cpu_stream(mkldnn::stream::kind::eager) {}
+ ~Stream() {}
+
+private:
+ stream _cpu_stream;
+};
+
+}} // namespace at::native
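
Both structs are Meyers singletons: the function-local static inside Instance() is constructed on first use and shared for the life of the process, so every MKLDNN primitive runs on one CPU engine and one eager stream. A usage sketch mirroring the call sites in Conv.cpp below (function name hypothetical):

    #include <ATen/mkldnn/Runtime.h>
    #include <vector>

    void run_net(std::vector<mkldnn::primitive>& net) {
      // Lazily constructed, process-wide engine; handed to primitive
      // descriptors at construction time.
      auto cpu_engine = at::native::CpuEngine::Instance().get_engine();
      (void)cpu_engine;
      // The eager stream executes the whole primitive list on submit.
      at::native::Stream::Instance().get_stream().submit(net);
    }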
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index b65759249a..33f7e05d26 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -32,6 +32,7 @@ struct ConvParams {
bool is_padding_neg() const;
void view1d_as_2d();
bool use_cudnn(const at::Tensor& input) const;
+ bool use_mkldnn(const at::Tensor& input) const;
bool use_nnpack(const at::Tensor& input) const;
bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
};
@@ -130,6 +131,17 @@ auto ConvParams::use_cudnn(const at::Tensor& input) const -> bool {
return false;
}
+auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool {
+#if AT_MKLDNN_ENABLED()
+ return input.type().backend() == kCPU &&
+ input.type().scalarType() == kFloat && // only on CPU Float Tensors
+ !is_dilated() && // doesn't support dilation
+ !transposed && // or transposed tensors
+ input.ndimension() == 4 && // must be in NCHW format
+ groups == 1;
+#endif
+ return false;
+}
auto ConvParams::use_nnpack(const at::Tensor& input) const -> bool {
#if AT_NNPACK_ENABLED()
return input.type().backend() == kCPU &&
@@ -371,6 +383,21 @@ at::Tensor _convolution(
params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
}
#endif
+ } else if (params.use_mkldnn(input)) {
+#if AT_MKLDNN_ENABLED()
+ if (input.type() != weight.type()){
+ std::stringstream ss;
+ ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same";
+ throw std::runtime_error(ss.str());
+ }
+ if (bias.defined() && input.type() != bias.type()){
+ std::stringstream ss;
+ ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same";
+ throw std::runtime_error(ss.str());
+ }
+
+ output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation);
+#endif
} else {
if (params.groups == 1) {
output = at::_convolution_nogroup(
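
The predicate is deliberately narrow: only plain 2-d convolutions reach MKLDNN here. A standalone paraphrase of the conditions (a sketch; the real check is a ConvParams member reading its own fields):

    #include <ATen/ATen.h>

    // All of these must hold, otherwise _convolution falls through to the
    // nogroup/THNN path.
    bool takes_mkldnn_path(const at::Tensor& input, bool dilated,
                           bool transposed, int64_t groups) {
      return input.type().backend() == at::kCPU &&      // CPU tensors only
             input.type().scalarType() == at::kFloat && // float32 only
             !dilated &&                 // dilation unsupported
             !transposed &&              // transposed conv unsupported
             input.ndimension() == 4 &&  // NCHW, i.e. 2-d spatial
             groups == 1;                // grouped conv unsupported
    }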
diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
new file mode 100644
index 0000000000..25cddef9ae
--- /dev/null
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -0,0 +1,441 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/Config.h>
+
+#if !AT_MKLDNN_ENABLED()
+
+namespace at { namespace native {
+
+at::Tensor mkldnn_convolution(
+ const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+ IntList padding, IntList stride, IntList dilation) {
+ throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support");
+}
+
+at::Tensor mkldnn_convolution_backward_input(
+ IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined) {
+ throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor,at::Tensor> mkldnn_convolution_backward_weights(
+ IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined) {
+ throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
+ const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask) {
+ throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support");
+}
+
+}} // namespace at::native
+
+#else // AT_MKLDNN_ENABLED
+
+#include <ATen/mkldnn/Runtime.h>
+
+using namespace mkldnn;
+
+namespace at { namespace native {
+
+constexpr int input_batch_size_dim = 0; // also grad_input
+constexpr int input_channels_dim = 1;
+constexpr int output_batch_size_dim = 0; // also grad_output
+constexpr int output_channels_dim = 1;
+constexpr int weight_output_channels_dim = 0;
+constexpr int weight_input_channels_dim = 1;
+
+// Often written as 2 + max_dim (extra dims for batch size and channels)
+constexpr int max_dim = 3;
+
+std::vector<int64_t> conv_output_size(
+ IntList input_size, IntList weight_size,
+ IntList padding, IntList stride, IntList dilation)
+{
+ auto dim = input_size.size();
+ std::vector<int64_t> output_size(dim);
+ output_size[0] = input_size[input_batch_size_dim];
+ output_size[1] = weight_size[weight_output_channels_dim];
+ for (size_t d = 2; d < dim; ++d) {
+ auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
+ output_size[d] = (input_size[d] + (2 * padding[d - 2])
+ - kernel) / stride[d - 2] + 1;
+ }
+ return output_size;
+}
+
+at::Tensor mkldnn_convolution(
+ const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+ IntList padding, IntList stride, IntList dilation)
+{
+ auto output = input.type().tensor(conv_output_size(
+ input.sizes(), weight.sizes(), padding, stride, dilation));
+
+ auto cpu_engine = CpuEngine::Instance().get_engine();
+
+ int32_t n = input.size(0);
+ int32_t ic = input.size(1);
+ int32_t ih = input.size(2);
+ int32_t iw = input.size(3);
+
+ int32_t oc = output.size(1);
+ int32_t oh = output.size(2);
+ int32_t ow = output.size(3);
+
+ int32_t kh = weight.size(2);
+ int32_t kw = weight.size(3);
+
+ int32_t sh = stride[0];
+ int32_t sw = stride[1];
+ int32_t ph = padding[0];
+ int32_t pw = padding[1];
+
+ auto data_t = memory::data_type::f32;
+ auto format_any = memory::format::any;
+ auto format_nchw = memory::format::nchw;
+ auto format_oihw = memory::format::oihw;
+ auto format_x = memory::format::x;
+
+ memory::dims input_tz = {n, ic, ih, iw};
+ memory::dims weight_tz = {oc, ic, kh, kw};
+ memory::dims bias_tz = {oc};
+ memory::dims output_tz = {n, oc, oh, ow};
+ memory::dims _stride = {sh, sw};
+ memory::dims _padding = {ph, pw};
+
+ auto input_md = memory::desc({input_tz}, data_t, format_any);
+ auto weight_md = memory::desc({weight_tz}, data_t, format_any);
+ auto bias_md = memory::desc({bias_tz}, data_t, format_any);
+ auto output_md = memory::desc({output_tz}, data_t, format_any);
+
+ std::shared_ptr<convolution_forward::desc> conv_forward_desc;
+ if (bias.defined()) {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
+ conv_forward_pd.reset(new convolution_forward::primitive_desc(
+ *conv_forward_desc, cpu_engine));
+
+ auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
+ input.data_ptr());
+ auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+ weight.data_ptr());
+ auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
+ output.data_ptr());
+
+ std::vector<primitive> net;
+
+ auto input_pd = conv_forward_pd->src_primitive_desc();
+ auto input_memory = input_usr_memory;
+ if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
+ input_memory = memory(input_pd);
+ net.push_back(reorder(input_usr_memory, input_memory));
+ }
+
+ auto weight_pd = conv_forward_pd->weights_primitive_desc();
+ auto weight_memory = weight_usr_memory;
+ if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
+ weight_memory = memory(weight_pd);
+ net.push_back(reorder(weight_usr_memory, weight_memory));
+ }
+
+ auto output_pd = conv_forward_pd->dst_primitive_desc();
+ auto output_memory = output_usr_memory;
+ if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) {
+ output_memory = memory(output_pd);
+ }
+
+ std::shared_ptr<convolution_forward> conv_forward;
+ std::shared_ptr<memory> bias_usr_memory;
+ if (bias.defined()) {
+ bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
+ bias.data_ptr()));
+ conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
+ weight_memory, *bias_usr_memory, output_memory));
+ } else {
+ conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
+ weight_memory, output_memory));
+ }
+ net.push_back(*conv_forward);
+
+ if (output_memory != output_usr_memory) {
+ net.push_back(reorder(output_memory, output_usr_memory));
+ }
+
+ Stream::Instance().get_stream().submit(net);
+
+ return output;
+}
+
+Tensor mkldnn_convolution_backward_input(
+ IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined)
+{
+ auto grad_input = grad_output.type().tensor(input_size);
+
+ auto cpu_engine = CpuEngine::Instance().get_engine();
+
+ int32_t n = grad_input.size(0);
+ int32_t ic = grad_input.size(1);
+ int32_t ih = grad_input.size(2);
+ int32_t iw = grad_input.size(3);
+
+ int32_t oc = grad_output.size(1);
+ int32_t oh = grad_output.size(2);
+ int32_t ow = grad_output.size(3);
+
+ int32_t kh = weight.size(2);
+ int32_t kw = weight.size(3);
+
+ int32_t sh = stride[0];
+ int32_t sw = stride[1];
+ int32_t ph = padding[0];
+ int32_t pw = padding[1];
+
+ auto data_t = memory::data_type::f32;
+ auto format_any = memory::format::any;
+ auto format_nchw = memory::format::nchw;
+ auto format_oihw = memory::format::oihw;
+
+ memory::dims input_tz = {n, ic, ih, iw};
+ memory::dims weight_tz = {oc, ic, kh, kw};
+ memory::dims bias_tz = {oc};
+ memory::dims output_tz = {n, oc, oh, ow};
+ memory::dims _stride = {sh, sw};
+ memory::dims _padding = {ph, pw};
+
+ auto input_md = memory::desc({input_tz}, data_t, format_any);
+ auto weight_md = memory::desc({weight_tz}, data_t, format_any);
+ auto bias_md = memory::desc({bias_tz}, data_t, format_any);
+ auto output_md = memory::desc({output_tz}, data_t, format_any);
+
+ // need to re-create conv_forward_pd to feed conv_backward_data_pd
+ std::shared_ptr<convolution_forward::desc> conv_forward_desc;
+ if (bias_defined) {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
+ conv_forward_pd.reset(new convolution_forward::primitive_desc(
+ *conv_forward_desc, cpu_engine));
+
+ std::shared_ptr<convolution_backward_data::desc> conv_backward_data_desc;
+ conv_backward_data_desc.reset(new convolution_backward_data::desc(
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+
+ std::shared_ptr<convolution_backward_data::primitive_desc> conv_backward_data_pd;
+ conv_backward_data_pd.reset(new convolution_backward_data::primitive_desc(
+ *conv_backward_data_desc, cpu_engine, *conv_forward_pd));
+
+ auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
+ grad_output.data_ptr());
+ auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+ weight.data_ptr());
+ auto grad_input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
+ grad_input.data_ptr());
+
+ std::vector<primitive> net;
+
+ auto grad_output_pd = conv_backward_data_pd->diff_dst_primitive_desc();
+ auto grad_output_memory = grad_output_usr_memory;
+ if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
+ grad_output_memory = memory(grad_output_pd);
+ net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
+ }
+
+ auto weight_pd = conv_backward_data_pd->weights_primitive_desc();
+ auto weight_memory = weight_usr_memory;
+ if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
+ weight_memory = memory(weight_pd);
+ net.push_back(reorder(weight_usr_memory, weight_memory));
+ }
+
+ auto grad_input_pd = conv_backward_data_pd->diff_src_primitive_desc();
+ auto grad_input_memory = grad_input_usr_memory;
+ if (grad_input_memory.get_primitive_desc() != memory::primitive_desc(grad_input_pd)) {
+ grad_input_memory = memory(grad_input_pd);
+ }
+
+ std::shared_ptr<convolution_backward_data> conv_backward_data;
+ conv_backward_data.reset(new convolution_backward_data(*conv_backward_data_pd,
+ grad_output_memory, weight_memory, grad_input_memory));
+ net.push_back(*conv_backward_data);
+
+ if (grad_input_memory != grad_input_usr_memory) {
+ net.push_back(reorder(grad_input_memory, grad_input_usr_memory));
+ }
+
+ Stream::Instance().get_stream().submit(net);
+
+ return grad_input;
+}
+
+std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
+ IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined)
+{
+ auto grad_weight = grad_output.type().tensor(weight_size);
+
+ Tensor grad_bias;
+ if (bias_defined) {
+ grad_bias = grad_output.type().tensor({grad_output.size(1)});
+ }
+
+ auto cpu_engine = CpuEngine::Instance().get_engine();
+
+ int32_t n = input.size(0);
+ int32_t ic = input.size(1);
+ int32_t ih = input.size(2);
+ int32_t iw = input.size(3);
+
+ int32_t oc = grad_output.size(1);
+ int32_t oh = grad_output.size(2);
+ int32_t ow = grad_output.size(3);
+
+ int32_t kh = grad_weight.size(2);
+ int32_t kw = grad_weight.size(3);
+
+ int32_t sh = stride[0];
+ int32_t sw = stride[1];
+ int32_t ph = padding[0];
+ int32_t pw = padding[1];
+
+ auto data_t = memory::data_type::f32;
+ auto format_any = memory::format::any;
+ auto format_nchw = memory::format::nchw;
+ auto format_oihw = memory::format::oihw;
+ auto format_x = memory::format::x;
+
+ memory::dims input_tz = {n, ic, ih, iw};
+ memory::dims weight_tz = {oc, ic, kh, kw};
+ memory::dims bias_tz = {oc};
+ memory::dims output_tz = {n, oc, oh, ow};
+ memory::dims _stride = {sh, sw};
+ memory::dims _padding = {ph, pw};
+
+ memory::desc input_md({input_tz}, data_t, format_any);
+ memory::desc weight_md({weight_tz}, data_t, format_any);
+ memory::desc bias_md({bias_tz}, data_t, format_any);
+ memory::desc output_md({output_tz}, data_t, format_any);
+
+ // need to re-create conv_forward_pd to feed conv_backward_weight_pd
+ std::shared_ptr<convolution_forward::desc> conv_forward_desc;
+ if (bias_defined) {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
+ conv_forward_pd.reset(new convolution_forward::primitive_desc(
+ *conv_forward_desc, cpu_engine));
+
+ std::shared_ptr<convolution_backward_weights::desc> conv_backward_weight_desc;
+ if (bias_defined) {
+ conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_backward_weights::primitive_desc> conv_backward_weight_pd;
+ conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc(
+ *conv_backward_weight_desc, cpu_engine, *conv_forward_pd));
+
+ auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
+ input.data_ptr());
+ auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
+ grad_output.data_ptr());
+ auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+ grad_weight.data_ptr());
+ std::shared_ptr<memory> grad_bias_memory;
+
+ std::vector<primitive> net;
+
+ auto input_pd = conv_backward_weight_pd->src_primitive_desc();
+ auto input_memory = input_usr_memory;
+ if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
+ input_memory = memory(input_pd);
+ net.push_back(reorder(input_usr_memory, input_memory));
+ }
+
+ auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc();
+ auto grad_output_memory = grad_output_usr_memory;
+ if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
+ grad_output_memory = memory(grad_output_pd);
+ net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
+ }
+
+ auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc();
+ auto grad_weight_memory = grad_weight_usr_memory;
+ if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) {
+ grad_weight_memory = memory(grad_weight_pd);
+ }
+
+ std::shared_ptr<convolution_backward_weights> conv_backward_weight;
+ if (bias_defined) {
+ grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
+ grad_bias.data_ptr()));
+ conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
+ input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory));
+ } else {
+ conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
+ input_memory, grad_output_memory, grad_weight_memory));
+ }
+
+ net.push_back(*conv_backward_weight);
+
+ if (grad_weight_memory != grad_weight_usr_memory) {
+ net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory));
+ }
+
+ Stream::Instance().get_stream().submit(net);
+
+ return std::tuple<at::Tensor, at::Tensor>{grad_weight, grad_bias};
+}
+
+std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
+ const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask)
+{
+ Tensor grad_output = grad_output_t.contiguous();
+
+ Tensor grad_input, grad_weight, grad_bias;
+ if (output_mask[0]) {
+ grad_input = at::mkldnn_convolution_backward_input(
+ input.sizes(), grad_output, weight, padding, stride, dilation, output_mask[2]);
+ }
+ if (output_mask[1] || output_mask[2]) {
+ std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights(
+ weight.sizes(), grad_output, input, padding, stride, dilation, output_mask[2]);
+ }
+
+ return std::tuple<Tensor, Tensor, Tensor>{grad_input, grad_weight, grad_bias};
+}
+
+}} // namespace at::native
+
+#endif
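
All three bodies above follow the same MKLDNN 0.x pattern: describe every tensor with memory::format::any, let the primitive descriptor pick the internal layout, and insert reorder primitives only where the user-visible NCHW/OIHW layout differs from the chosen one. The arithmetic in conv_output_size is the standard formula, applied per spatial dimension: kernel extent = dilation*(k-1)+1, output = (in + 2*pad - extent)/stride + 1. A worked check of that formula (numbers illustrative, not from the patch):

    #include <cassert>
    #include <cstdint>

    // Same per-dimension arithmetic as conv_output_size above.
    int64_t out_extent(int64_t in, int64_t k, int64_t pad,
                       int64_t stride, int64_t dilation) {
      int64_t kernel = dilation * (k - 1) + 1;
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // ResNet-style first layer: 224 input, 7x7 kernel, pad 3, stride 2:
      // (224 + 6 - 7) / 2 + 1 = 112.
      assert(out_extent(224, 7, 3, 2, 1) == 112);
      // "Same" padding: 3x3 kernel, pad 1, stride 1 preserves the extent.
      assert(out_extent(32, 3, 1, 1, 1) == 32);
      return 0;
    }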
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 7e665909d7..93fa94eac3 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -711,3 +711,15 @@
dispatch:
CPU: _s_poisson_cpu
CUDA: _s_poisson_cuda
+
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation) -> Tensor
+ variants: function
+
+- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, bool bias_defined) -> Tensor
+ variants: function
+
+- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, bool bias_defined) -> (Tensor, Tensor)
+ variants: function
+
+- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
+ variants: function
diff --git a/setup.py b/setup.py
index a5bda0f2b2..68f134f62a 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,9 @@
# NO_CUDNN
# disables the cuDNN build
#
+# NO_MKLDNN
+# disables the MKLDNN build
+#
# NO_NNPACK
# disables NNPACK build
#
@@ -67,6 +70,11 @@
# NCCL_INCLUDE_DIR
# specify where nccl is installed
#
+# MKLDNN_LIB_DIR
+# MKLDNN_LIBRARY
+# MKLDNN_INCLUDE_DIR
+# specify where MKLDNN is installed
+#
# NVTOOLSEXT_PATH (Windows only)
# specify where nvtoolsext is installed
#
@@ -99,6 +107,8 @@ from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY,
CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \
NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
+from tools.setup_helpers.mkldnn import (WITH_MKLDNN, MKLDNN_LIBRARY,
+ MKLDNN_LIB_DIR, MKLDNN_INCLUDE_DIR)
from tools.setup_helpers.nnpack import WITH_NNPACK
from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
from tools.setup_helpers.generate_code import generate_code
@@ -214,6 +224,11 @@ def build_libs(libs):
my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
+ if WITH_MKLDNN:
+ my_env["MKLDNN_LIB_DIR"] = MKLDNN_LIB_DIR
+ my_env["MKLDNN_LIBRARY"] = MKLDNN_LIBRARY
+ my_env["MKLDNN_INCLUDE_DIR"] = MKLDNN_INCLUDE_DIR
+ build_libs_cmd += ['--with-mkldnn']
if WITH_GLOO_IBVERBS:
build_libs_cmd += ['--with-gloo-ibverbs']
@@ -397,6 +412,10 @@ class build_ext(build_ext_parent):
print('-- Detected CUDA at ' + CUDA_HOME)
else:
print('-- Not using CUDA')
+ if WITH_MKLDNN:
+ print('-- Detected MKLDNN at ' + MKLDNN_LIBRARY + ', ' + MKLDNN_INCLUDE_DIR)
+ else:
+ print('-- Not using MKLDNN')
if WITH_NCCL and WITH_SYSTEM_NCCL:
print('-- Using system provided NCCL library at ' +
NCCL_SYSTEM_LIB + ', ' + NCCL_INCLUDE_DIR)
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 98a46b4446..1cb4afb111 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1126,3 +1126,7 @@
- name: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, Tensor dropout_state)
input, hx, cx, weight: "_cudnn_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)"
+
+# mkldnn
+- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList stride, IntList dilation)
+ self, weight, bias: mkldnn_convolution_backward(self, grad, weight, padding, stride, dilation, grad_input_mask)
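
The derivative entry funnels all three gradients through the single backward function; grad_input_mask tells it which outputs autograd actually needs. A hedged sketch of calling it directly (backward_sketch is a hypothetical helper; padding/stride/dilation values illustrative):

    #include <ATen/ATen.h>
    #include <array>
    #include <tuple>

    // Request grad_input and grad_weight but skip grad_bias. Note that
    // output_mask[2] doubles as bias_defined inside the backward, so
    // backward_weights still runs here but produces no bias gradient.
    std::tuple<at::Tensor, at::Tensor, at::Tensor> backward_sketch(
        const at::Tensor& input, const at::Tensor& grad_output,
        const at::Tensor& weight) {
      std::array<bool, 3> mask = {{true, true, false}};
      return at::mkldnn_convolution_backward(
          input, grad_output, weight,
          /*padding=*/{1, 1}, /*stride=*/{1, 1}, /*dilation=*/{1, 1}, mask);
    }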
diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh
index cb403ca287..34f23317d8 100755
--- a/tools/build_pytorch_libs.sh
+++ b/tools/build_pytorch_libs.sh
@@ -23,6 +23,12 @@ if [[ "$1" == "--with-nnpack" ]]; then
shift
fi
+WITH_MKLDNN=0
+if [[ "$1" == "--with-mkldnn" ]]; then
+ WITH_MKLDNN=1
+ shift
+fi
+
WITH_GLOO_IBVERBS=0
if [[ "$1" == "--with-gloo-ibverbs" ]]; then
WITH_GLOO_IBVERBS=1
@@ -202,6 +208,10 @@ function build_aten() {
-DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \
-DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \
-DCUDNN_LIBRARY=$CUDNN_LIBRARY \
+ -DNO_MKLDNN=$((1-$WITH_MKLDNN)) \
+ -DMKLDNN_INCLUDE_DIR=$MKLDNN_INCLUDE_DIR \
+ -DMKLDNN_LIB_DIR=$MKLDNN_LIB_DIR \
+ -DMKLDNN_LIBRARY=$MKLDNN_LIBRARY \
-DATEN_NO_CONTRIB=1 \
-DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
diff --git a/tools/setup_helpers/mkldnn.py b/tools/setup_helpers/mkldnn.py
new file mode 100644
index 0000000000..2dd5445278
--- /dev/null
+++ b/tools/setup_helpers/mkldnn.py
@@ -0,0 +1,81 @@
+import platform
+import glob
+import os
+import sys
+
+from itertools import chain
+from .env import check_env_flag
+
+
+def gather_paths(env_vars):
+ return list(chain(*(os.getenv(v, '').split(':') for v in env_vars)))
+
+IS_LINUX = platform.system() == 'Linux'
+IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version
+CONDA_DIR = os.path.join(os.path.dirname(sys.executable), '..')
+
+MKLDNN_HOME = os.getenv('MKLDNN_HOME', '/usr/local/mkl-dnn')
+
+WITH_MKLDNN = False
+MKLDNN_LIB_DIR = None
+MKLDNN_INCLUDE_DIR = None
+MKLDNN_LIBRARY = None
+if IS_LINUX and not check_env_flag('NO_MKLDNN'):
+ lib_paths = list(filter(bool, [
+ os.getenv('MKLDNN_LIB_DIR'),
+ os.path.join(MKLDNN_HOME, 'lib'),
+ os.path.join(MKLDNN_HOME, 'lib64'),
+ '/usr/lib/',
+ '/usr/lib64/',
+ ] + gather_paths([
+ 'LIBRARY_PATH',
+ ]) + gather_paths([
+ 'LD_LIBRARY_PATH',
+ ])))
+ include_paths = list(filter(bool, [
+ os.getenv('MKLDNN_INCLUDE_DIR'),
+ os.path.join(MKLDNN_HOME, 'include'),
+ '/usr/include/',
+ ] + gather_paths([
+ 'CPATH',
+ 'C_INCLUDE_PATH',
+ 'CPLUS_INCLUDE_PATH',
+ ])))
+ if IS_CONDA:
+ lib_paths.append(os.path.join(CONDA_DIR, 'lib'))
+ include_paths.append(os.path.join(CONDA_DIR, 'include'))
+ for path in lib_paths:
+ if path is None or not os.path.exists(path):
+ continue
+ else:
+ libraries = sorted(glob.glob(os.path.join(path, 'libmkldnn*')))
+ if libraries:
+ if not glob.glob(os.path.join(path, 'libmklml_intel*')):
+ print("WARNING: MKL-DNN is not compiled with Intel MKL small library")
+ print("Convolution performance might be suboptimal")
+ print("Refer https://github.com/01org/mkl-dnn for detail info")
+ MKLDNN_LIBRARY = libraries[0]
+ MKLDNN_LIB_DIR = path
+ break
+ for path in include_paths:
+ if path is None or not os.path.exists(path):
+ continue
+ else:
+ if os.path.exists(os.path.join(path, 'mkldnn.hpp')):
+ MKLDNN_INCLUDE_DIR = path
+ break
+
+ # Specifying the library directly overrides the searched lib directory
+ library = os.getenv('MKLDNN_LIBRARY')
+ if library is not None and os.path.exists(library):
+ MKLDNN_LIBRARY = library
+ MKLDNN_LIB_DIR = os.path.dirname(MKLDNN_LIBRARY)
+
+ if not all([MKLDNN_LIBRARY, MKLDNN_LIB_DIR, MKLDNN_INCLUDE_DIR]):
+ MKLDNN_LIBRARY = MKLDNN_LIB_DIR = MKLDNN_INCLUDE_DIR = None
+ else:
+ real_mkldnn_library = os.path.realpath(MKLDNN_LIBRARY)
+ real_mkldnn_lib_dir = os.path.realpath(MKLDNN_LIB_DIR)
+ assert os.path.dirname(real_mkldnn_library) == real_mkldnn_lib_dir, (
+ 'mkldnn library and lib_dir must agree')
+ WITH_MKLDNN = True