author     Ma Mingfei <mingfei.ma@intel.com>      2018-03-30 06:25:07 +0800
committer  Soumith Chintala <soumith@gmail.com>   2018-03-29 15:25:07 -0700
commit     f8270c0225e19403038aec2d8af2697a2b5326ec (patch)
tree       7dbc688871d3943a2ec3f2d896e8d4e0637d990b
parent     e4c0bb1809fd9bf9161392bfff7d06092adc224d (diff)
Enable MKLDNN convolution forward and backward (#6062)
* Enable MKLDNN convolution forward and backward
* minor change
* fix mkldnn build error when building ATen standalone
-rw-r--r--  aten/CMakeLists.txt                          |  14
-rw-r--r--  aten/cmake/FindMKLDNN.cmake                  |  32
-rw-r--r--  aten/src/ATen/CMakeLists.txt                 |  12
-rw-r--r--  aten/src/ATen/Config.h.in                    |   1
-rw-r--r--  aten/src/ATen/mkldnn/Runtime.cpp             |   5
-rw-r--r--  aten/src/ATen/mkldnn/Runtime.h               |  49
-rw-r--r--  aten/src/ATen/native/Convolution.cpp         |  27
-rw-r--r--  aten/src/ATen/native/mkldnn/Conv.cpp         | 441
-rw-r--r--  aten/src/ATen/native/native_functions.yaml   |  12
-rw-r--r--  setup.py                                     |  19
-rw-r--r--  tools/autograd/derivatives.yaml              |   4
-rwxr-xr-x  tools/build_pytorch_libs.sh                  |  10
-rw-r--r--  tools/setup_helpers/mkldnn.py                |  81
13 files changed, 705 insertions(+), 2 deletions(-)
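
For orientation before the per-file hunks: the patch adds a new ATen-native entry point, at::mkldnn_convolution, plus its backward functions, and wires them into the generic _convolution dispatcher. A minimal usage sketch against the ATen API of this era (shapes illustrative, not taken from the patch):

    #include <ATen/ATen.h>

    int main() {
      // CPU float NCHW tensors -- the only configuration the new
      // ConvParams::use_mkldnn() predicate accepts (see Convolution.cpp below).
      auto input  = at::CPU(at::kFloat).ones({1, 3, 224, 224});  // N, C, H, W
      auto weight = at::CPU(at::kFloat).ones({64, 3, 7, 7});     // OC, IC, KH, KW
      auto bias   = at::CPU(at::kFloat).ones({64});
      // dilation must stay {1, 1}: the MKLDNN path rejects dilated convolution.
      auto output = at::mkldnn_convolution(
          input, weight, bias,
          /*padding=*/{3, 3}, /*stride=*/{2, 2}, /*dilation=*/{1, 1});
      return output.defined() ? 0 : 1;
    }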
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 16e7cc6790..13e4abca94 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -460,6 +460,20 @@ ELSE()
set(AT_CUDNN_ENABLED 1)
ENDIF()
+if(NO_MKLDNN)
+ message("disabling MKLDNN because NO_MKLDNN is set")
+ set(AT_MKLDNN_ENABLED 0)
+else()
+ find_package(MKLDNN)
+ if(NOT MKLDNN_FOUND)
+ message(STATUS "MKLDNN not found. Compiling without MKLDNN support")
+ set(AT_MKLDNN_ENABLED 0)
+ else()
+ INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIRS})
+ set(AT_MKLDNN_ENABLED 1)
+ endif()
+endif()
+
if(NO_NNPACK)
message("disabling NNPACK because NO_NNPACK is set")
set(AT_NNPACK_ENABLED 0)
diff --git a/aten/cmake/FindMKLDNN.cmake b/aten/cmake/FindMKLDNN.cmake
new file mode 100644
index 0000000000..0862d5a3ac
--- /dev/null
+++ b/aten/cmake/FindMKLDNN.cmake
@@ -0,0 +1,32 @@
+# - Try to find MKLDNN
+#
+# The following variables are optionally searched for defaults
+# MKLDNN_ROOT_DIR: Base directory where all MKLDNN components are found
+#
+# The following are set after configuration is done:
+# MKLDNN_FOUND
+# MKLDNN_INCLUDE_DIRS
+# MKLDNN_LIBRARIES
+# MKLDNN_LIBRARY_DIRS
+
+include(FindPackageHandleStandardArgs)
+
+set(MKLDNN_ROOT_DIR "" CACHE PATH "Folder containing Intel MKLDNN")
+
+find_path(MKLDNN_INCLUDE_DIR mkldnn.h
+ HINTS ${MKLDNN_ROOT_DIR}
+ PATH_SUFFIXES include)
+
+find_library(MKLDNN_LIBRARY mkldnn
+ HINTS ${MKLDNN_LIB_DIR} ${MKLDNN_ROOT_DIR}
+ PATH_SUFFIXES lib lib64)
+
+find_package_handle_standard_args(
+ MKLDNN DEFAULT_MSG MKLDNN_INCLUDE_DIR MKLDNN_LIBRARY)
+
+if(MKLDNN_FOUND)
+ set(MKLDNN_INCLUDE_DIRS ${MKLDNN_INCLUDE_DIR})
+ set(MKLDNN_LIBRARIES ${MKLDNN_LIBRARY})
+ message(STATUS "Found MKLDNN (include: ${MKLDNN_INCLUDE_DIR}, library: ${MKLDNN_LIBRARY})")
+ mark_as_advanced(MKLDNN_ROOT_DIR MKLDNN_LIBRARY MKLDNN_INCLUDE_DIR)
+endif()
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index daf986c6bd..27f1d98461 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -149,6 +149,7 @@ FILE(GLOB native_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/*.cpp")
FILE(GLOB native_cudnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/cudnn/*.cpp")
FILE(GLOB native_cuda_cu RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/cuda/*.cu")
FILE(GLOB native_mkl_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/mkl/*.cpp")
+FILE(GLOB native_mkldnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "native/mkldnn/*.cpp")
FILE(GLOB_RECURSE cuda_h
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
@@ -156,6 +157,7 @@ FILE(GLOB_RECURSE cuda_h
FILE(GLOB cudnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "cudnn/*.cpp")
FILE(GLOB mkl_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "mkl/*.cpp")
+FILE(GLOB mkldnn_cpp RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "mkldnn/*.cpp")
FILE(GLOB all_python RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.py")
@@ -201,7 +203,7 @@ ADD_CUSTOM_TARGET(aten_files_are_generated
)
-SET(all_cpp ${base_cpp} ${native_cpp} ${native_cudnn_cpp} ${native_mkl_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp})
+SET(all_cpp ${base_cpp} ${native_cpp} ${native_cudnn_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp})
INCLUDE_DIRECTORIES(${ATen_CPU_INCLUDE})
IF(NOT NO_CUDA)
@@ -218,6 +220,10 @@ IF(NOT NO_CUDA)
ENDIF()
endif()
+IF(AT_MKLDNN_ENABLED)
+ SET(all_cpp ${all_cpp} ${mkldnn_cpp})
+ENDIF()
+
filter_list(generated_h generated_cpp "\\.h$")
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/..)
@@ -315,6 +321,9 @@ if (NNPACK_FOUND)
target_link_libraries(ATen ${NNPACK_LIBRARIES})
endif(NNPACK_FOUND)
+if(MKLDNN_FOUND)
+ target_link_libraries(ATen ${MKLDNN_LIBRARIES})
+endif(MKLDNN_FOUND)
# ---[ Configure cpuinfo
IF(NOT TARGET cpuinfo)
@@ -326,7 +335,6 @@ IF(NOT TARGET cpuinfo)
ENDIF()
TARGET_LINK_LIBRARIES(ATen cpuinfo)
-
IF(CUDA_FOUND)
TARGET_LINK_LIBRARIES(ATen
${CUDA_LIBRARIES}
diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in
index d62468fd70..1ab0ec9162 100644
--- a/aten/src/ATen/Config.h.in
+++ b/aten/src/ATen/Config.h.in
@@ -6,6 +6,7 @@
#define AT_CUDA_ENABLED() @AT_CUDA_ENABLED@
#define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@
+#define AT_MKLDNN_ENABLED() @AT_MKLDNN_ENABLED@
#define AT_NNPACK_ENABLED() @AT_NNPACK_ENABLED@
#define AT_MKL_ENABLED() @AT_MKL_ENABLED@
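
The generated macro joins the existing AT_CUDNN_ENABLED()/AT_NNPACK_ENABLED() family: codegen substitutes 0 or 1, and call sites branch on it at compile time. A minimal sketch of the consuming pattern (the helper name is hypothetical, not part of the patch):

    #include <ATen/Config.h>

    // Hypothetical helper: reports whether this build carries the MKLDNN path.
    bool built_with_mkldnn() {
    #if AT_MKLDNN_ENABLED()
      return true;
    #else
      return false;
    #endif
    }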
diff --git a/aten/src/ATen/mkldnn/Runtime.cpp b/aten/src/ATen/mkldnn/Runtime.cpp
new file mode 100644
index 0000000000..54f999ed14
--- /dev/null
+++ b/aten/src/ATen/mkldnn/Runtime.cpp
@@ -0,0 +1,5 @@
+#include "Runtime.h"
+
+namespace at { namespace native {
+
+}} // namespace at::native
diff --git a/aten/src/ATen/mkldnn/Runtime.h b/aten/src/ATen/mkldnn/Runtime.h
new file mode 100644
index 0000000000..c58ef2c56f
--- /dev/null
+++ b/aten/src/ATen/mkldnn/Runtime.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <mkldnn.hpp>
+
+using namespace mkldnn;
+
+namespace at { namespace native {
+
+// CpuEngine singleton
+struct CpuEngine {
+ static CpuEngine& Instance() {
+ static CpuEngine myInstance;
+ return myInstance;
+ }
+ engine& get_engine() {
+ return _cpu_engine;
+ }
+ CpuEngine(CpuEngine const&) = delete;
+ CpuEngine& operator=(CpuEngine const&) = delete;
+
+protected:
+ CpuEngine():_cpu_engine(mkldnn::engine::cpu, 0) {}
+ ~CpuEngine() {}
+
+private:
+ engine _cpu_engine;
+};
+
+// Stream singleton
+struct Stream {
+ static Stream& Instance() {
+ static Stream myInstance;
+ return myInstance;
+ }
+ stream& get_stream() {
+ return _cpu_stream;
+ }
+ Stream(Stream const&) = delete;
+ Stream& operator=(Stream const&) = delete;
+
+protected:
+ Stream():_cpu_stream(mkldnn::stream::kind::eager) {}
+ ~Stream() {}
+
+private:
+ stream _cpu_stream;
+};
+
+}} // namespace at::native
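
Both structs are Meyers singletons: the function-local static inside Instance() is constructed on first use and shared for the life of the process, so every MKLDNN primitive runs on one CPU engine and one eager stream. A usage sketch mirroring the call sites in Conv.cpp below (function name hypothetical):

    #include <ATen/mkldnn/Runtime.h>
    #include <vector>

    void run_net(std::vector<mkldnn::primitive>& net) {
      // Lazily constructed, process-wide engine; handed to primitive
      // descriptors at construction time.
      auto cpu_engine = at::native::CpuEngine::Instance().get_engine();
      (void)cpu_engine;
      // The eager stream executes the whole primitive list on submit.
      at::native::Stream::Instance().get_stream().submit(net);
    }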
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index b65759249a..33f7e05d26 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -32,6 +32,7 @@ struct ConvParams {
bool is_padding_neg() const;
void view1d_as_2d();
bool use_cudnn(const at::Tensor& input) const;
+ bool use_mkldnn(const at::Tensor& input) const;
bool use_nnpack(const at::Tensor& input) const;
bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
};
@@ -130,6 +131,17 @@ auto ConvParams::use_cudnn(const at::Tensor& input) const -> bool {
return false;
}
+auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool {
+#if AT_MKLDNN_ENABLED()
+ return input.type().backend() == kCPU &&
+ input.type().scalarType() == kFloat && // only on CPU Float Tensors
+ !is_dilated() && // doesn't support dilation
+ !transposed && // or transposed tensors
+ input.ndimension() == 4 && // must be in NCHW format
+ groups == 1;
+#endif
+ return false;
+}
auto ConvParams::use_nnpack(const at::Tensor& input) const -> bool {
#if AT_NNPACK_ENABLED()
return input.type().backend() == kCPU &&
@@ -371,6 +383,21 @@ at::Tensor _convolution(
params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
}
#endif
+ } else if (params.use_mkldnn(input)) {
+#if AT_MKLDNN_ENABLED()
+ if (input.type() != weight.type()){
+ std::stringstream ss;
+ ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same";
+ throw std::runtime_error(ss.str());
+ }
+ if (bias.defined() && input.type() != bias.type()){
+ std::stringstream ss;
+ ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same";
+ throw std::runtime_error(ss.str());
+ }
+
+ output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation);
+#endif
} else {
if (params.groups == 1) {
output = at::_convolution_nogroup(
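
The predicate is deliberately narrow: only plain 2-d convolutions reach MKLDNN here. A standalone paraphrase of the conditions (a sketch; the real check is a ConvParams member reading its own fields):

    #include <ATen/ATen.h>

    // All of these must hold, otherwise _convolution falls through to the
    // nogroup/THNN path.
    bool takes_mkldnn_path(const at::Tensor& input, bool dilated,
                           bool transposed, int64_t groups) {
      return input.type().backend() == at::kCPU &&      // CPU tensors only
             input.type().scalarType() == at::kFloat && // float32 only
             !dilated &&                 // dilation unsupported
             !transposed &&              // transposed conv unsupported
             input.ndimension() == 4 &&  // NCHW, i.e. 2-d spatial
             groups == 1;                // grouped conv unsupported
    }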
diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
new file mode 100644
index 0000000000..25cddef9ae
--- /dev/null
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -0,0 +1,441 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/Config.h>
+
+#if !AT_MKLDNN_ENABLED()
+
+namespace at { namespace native {
+
+at::Tensor mkldnn_convolution(
+ const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+ IntList padding, IntList stride, IntList dilation) {
+ throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support");
+}
+
+at::Tensor mkldnn_convolution_backward_input(
+ IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined) {
+ throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor,at::Tensor> mkldnn_convolution_backward_weights(
+ IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined) {
+ throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
+ const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask) {
+ throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support");
+}
+
+}} // namespace at::native
+
+#else // AT_MKLDNN_ENABLED
+
+#include <ATen/mkldnn/Runtime.h>
+
+using namespace mkldnn;
+
+namespace at { namespace native {
+
+constexpr int input_batch_size_dim = 0; // also grad_input
+constexpr int input_channels_dim = 1;
+constexpr int output_batch_size_dim = 0; // also grad_output
+constexpr int output_channels_dim = 1;
+constexpr int weight_output_channels_dim = 0;
+constexpr int weight_input_channels_dim = 1;
+
+// Often written as 2 + max_dim (extra dims for batch size and channels)
+constexpr int max_dim = 3;
+
+std::vector<int64_t> conv_output_size(
+ IntList input_size, IntList weight_size,
+ IntList padding, IntList stride, IntList dilation)
+{
+ auto dim = input_size.size();
+ std::vector<int64_t> output_size(dim);
+ output_size[0] = input_size[input_batch_size_dim];
+ output_size[1] = weight_size[weight_output_channels_dim];
+ for (size_t d = 2; d < dim; ++d) {
+ auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
+ output_size[d] = (input_size[d] + (2 * padding[d - 2])
+ - kernel) / stride[d - 2] + 1;
+ }
+ return output_size;
+}
+
+at::Tensor mkldnn_convolution(
+ const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
+ IntList padding, IntList stride, IntList dilation)
+{
+ auto output = input.type().tensor(conv_output_size(
+ input.sizes(), weight.sizes(), padding, stride, dilation));
+
+ auto cpu_engine = CpuEngine::Instance().get_engine();
+
+ int32_t n = input.size(0);
+ int32_t ic = input.size(1);
+ int32_t ih = input.size(2);
+ int32_t iw = input.size(3);
+
+ int32_t oc = output.size(1);
+ int32_t oh = output.size(2);
+ int32_t ow = output.size(3);
+
+ int32_t kh = weight.size(2);
+ int32_t kw = weight.size(3);
+
+ int32_t sh = stride[0];
+ int32_t sw = stride[1];
+ int32_t ph = padding[0];
+ int32_t pw = padding[1];
+
+ auto data_t = memory::data_type::f32;
+ auto format_any = memory::format::any;
+ auto format_nchw = memory::format::nchw;
+ auto format_oihw = memory::format::oihw;
+ auto format_x = memory::format::x;
+
+ memory::dims input_tz = {n, ic, ih, iw};
+ memory::dims weight_tz = {oc, ic, kh, kw};
+ memory::dims bias_tz = {oc};
+ memory::dims output_tz = {n, oc, oh, ow};
+ memory::dims _stride = {sh, sw};
+ memory::dims _padding = {ph, pw};
+
+ auto input_md = memory::desc({input_tz}, data_t, format_any);
+ auto weight_md = memory::desc({weight_tz}, data_t, format_any);
+ auto bias_md = memory::desc({bias_tz}, data_t, format_any);
+ auto output_md = memory::desc({output_tz}, data_t, format_any);
+
+ std::shared_ptr<convolution_forward::desc> conv_forward_desc;
+ if (bias.defined()) {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
+ conv_forward_pd.reset(new convolution_forward::primitive_desc(
+ *conv_forward_desc, cpu_engine));
+
+ auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
+ input.data_ptr());
+ auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+ weight.data_ptr());
+ auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
+ output.data_ptr());
+
+ std::vector<primitive> net;
+
+ auto input_pd = conv_forward_pd->src_primitive_desc();
+ auto input_memory = input_usr_memory;
+ if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
+ input_memory = memory(input_pd);
+ net.push_back(reorder(input_usr_memory, input_memory));
+ }
+
+ auto weight_pd = conv_forward_pd->weights_primitive_desc();
+ auto weight_memory = weight_usr_memory;
+ if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
+ weight_memory = memory(weight_pd);
+ net.push_back(reorder(weight_usr_memory, weight_memory));
+ }
+
+ auto output_pd = conv_forward_pd->dst_primitive_desc();
+ auto output_memory = output_usr_memory;
+ if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) {
+ output_memory = memory(output_pd);
+ }
+
+ std::shared_ptr<convolution_forward> conv_forward;
+ std::shared_ptr<memory> bias_usr_memory;
+ if (bias.defined()) {
+ bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
+ bias.data_ptr()));
+ conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
+ weight_memory, *bias_usr_memory, output_memory));
+ } else {
+ conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
+ weight_memory, output_memory));
+ }
+ net.push_back(*conv_forward);
+
+ if (output_memory != output_usr_memory) {
+ net.push_back(reorder(output_memory, output_usr_memory));
+ }
+
+ Stream::Instance().get_stream().submit(net);
+
+ return output;
+}
+
+Tensor mkldnn_convolution_backward_input(
+ IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined)
+{
+ auto grad_input = grad_output.type().tensor(input_size);
+
+ auto cpu_engine = CpuEngine::Instance().get_engine();
+
+ int32_t n = grad_input.size(0);
+ int32_t ic = grad_input.size(1);
+ int32_t ih = grad_input.size(2);
+ int32_t iw = grad_input.size(3);
+
+ int32_t oc = grad_output.size(1);
+ int32_t oh = grad_output.size(2);
+ int32_t ow = grad_output.size(3);
+
+ int32_t kh = weight.size(2);
+ int32_t kw = weight.size(3);
+
+ int32_t sh = stride[0];
+ int32_t sw = stride[1];
+ int32_t ph = padding[0];
+ int32_t pw = padding[1];
+
+ auto data_t = memory::data_type::f32;
+ auto format_any = memory::format::any;
+ auto format_nchw = memory::format::nchw;
+ auto format_oihw = memory::format::oihw;
+
+ memory::dims input_tz = {n, ic, ih, iw};
+ memory::dims weight_tz = {oc, ic, kh, kw};
+ memory::dims bias_tz = {oc};
+ memory::dims output_tz = {n, oc, oh, ow};
+ memory::dims _stride = {sh, sw};
+ memory::dims _padding = {ph, pw};
+
+ auto input_md = memory::desc({input_tz}, data_t, format_any);
+ auto weight_md = memory::desc({weight_tz}, data_t, format_any);
+ auto bias_md = memory::desc({bias_tz}, data_t, format_any);
+ auto output_md = memory::desc({output_tz}, data_t, format_any);
+
+ // need to re-create conv_forward_pd to feed conv_backward_data_pd
+ std::shared_ptr<convolution_forward::desc> conv_forward_desc;
+ if (bias_defined) {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
+ conv_forward_pd.reset(new convolution_forward::primitive_desc(
+ *conv_forward_desc, cpu_engine));
+
+ std::shared_ptr<convolution_backward_data::desc> conv_backward_data_desc;
+ conv_backward_data_desc.reset(new convolution_backward_data::desc(
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+
+ std::shared_ptr<convolution_backward_data::primitive_desc> conv_backward_data_pd;
+ conv_backward_data_pd.reset(new convolution_backward_data::primitive_desc(
+ *conv_backward_data_desc, cpu_engine, *conv_forward_pd));
+
+ auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
+ grad_output.data_ptr());
+ auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+ weight.data_ptr());
+ auto grad_input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
+ grad_input.data_ptr());
+
+ std::vector<primitive> net;
+
+ auto grad_output_pd = conv_backward_data_pd->diff_dst_primitive_desc();
+ auto grad_output_memory = grad_output_usr_memory;
+ if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
+ grad_output_memory = memory(grad_output_pd);
+ net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
+ }
+
+ auto weight_pd = conv_backward_data_pd->weights_primitive_desc();
+ auto weight_memory = weight_usr_memory;
+ if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
+ weight_memory = memory(weight_pd);
+ net.push_back(reorder(weight_usr_memory, weight_memory));
+ }
+
+ auto grad_input_pd = conv_backward_data_pd->diff_src_primitive_desc();
+ auto grad_input_memory = grad_input_usr_memory;
+ if (grad_input_memory.get_primitive_desc() != memory::primitive_desc(grad_input_pd)) {
+ grad_input_memory = memory(grad_input_pd);
+ }
+
+ std::shared_ptr<convolution_backward_data> conv_backward_data;
+ conv_backward_data.reset(new convolution_backward_data(*conv_backward_data_pd,
+ grad_output_memory, weight_memory, grad_input_memory));
+ net.push_back(*conv_backward_data);
+
+ if (grad_input_memory != grad_input_usr_memory) {
+ net.push_back(reorder(grad_input_memory, grad_input_usr_memory));
+ }
+
+ Stream::Instance().get_stream().submit(net);
+
+ return grad_input;
+}
+
+std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
+ IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
+ IntList padding, IntList stride, IntList dilation, bool bias_defined)
+{
+ auto grad_weight = grad_output.type().tensor(weight_size);
+
+ Tensor grad_bias;
+ if (bias_defined) {
+ grad_bias = grad_output.type().tensor({grad_output.size(1)});
+ }
+
+ auto cpu_engine = CpuEngine::Instance().get_engine();
+
+ int32_t n = input.size(0);
+ int32_t ic = input.size(1);
+ int32_t ih = input.size(2);
+ int32_t iw = input.size(3);
+
+ int32_t oc = grad_output.size(1);
+ int32_t oh = grad_output.size(2);
+ int32_t ow = grad_output.size(3);
+
+ int32_t kh = grad_weight.size(2);
+ int32_t kw = grad_weight.size(3);
+
+ int32_t sh = stride[0];
+ int32_t sw = stride[1];
+ int32_t ph = padding[0];
+ int32_t pw = padding[1];
+
+ auto data_t = memory::data_type::f32;
+ auto format_any = memory::format::any;
+ auto format_nchw = memory::format::nchw;
+ auto format_oihw = memory::format::oihw;
+ auto format_x = memory::format::x;
+
+ memory::dims input_tz = {n, ic, ih, iw};
+ memory::dims weight_tz = {oc, ic, kh, kw};
+ memory::dims bias_tz = {oc};
+ memory::dims output_tz = {n, oc, oh, ow};
+ memory::dims _stride = {sh, sw};
+ memory::dims _padding = {ph, pw};
+
+ memory::desc input_md({input_tz}, data_t, format_any);
+ memory::desc weight_md({weight_tz}, data_t, format_any);
+ memory::desc bias_md({bias_tz}, data_t, format_any);
+ memory::desc output_md({output_tz}, data_t, format_any);
+
+ // need to re-create conv_forward_pd to feed conv_backward_weight_pd
+ std::shared_ptr<convolution_forward::desc> conv_forward_desc;
+ if (bias_defined) {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
+ conv_forward_pd.reset(new convolution_forward::primitive_desc(
+ *conv_forward_desc, cpu_engine));
+
+ std::shared_ptr<convolution_backward_weights::desc> conv_backward_weight_desc;
+ if (bias_defined) {
+ conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
+ convolution_direct, input_md, weight_md, bias_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ } else {
+ conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
+ convolution_direct, input_md, weight_md, output_md,
+ _stride, _padding, _padding, padding_kind::zero));
+ }
+
+ std::shared_ptr<convolution_backward_weights::primitive_desc> conv_backward_weight_pd;
+ conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc(
+ *conv_backward_weight_desc, cpu_engine, *conv_forward_pd));
+
+ auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
+ input.data_ptr());
+ auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
+ grad_output.data_ptr());
+ auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+ grad_weight.data_ptr());
+ std::shared_ptr<memory> grad_bias_memory;
+
+ std::vector<primitive> net;
+
+ auto input_pd = conv_backward_weight_pd->src_primitive_desc();
+ auto input_memory = input_usr_memory;
+ if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
+ input_memory = memory(input_pd);
+ net.push_back(reorder(input_usr_memory, input_memory));
+ }
+
+ auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc();
+ auto grad_output_memory = grad_output_usr_memory;
+ if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
+ grad_output_memory = memory(grad_output_pd);
+ net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
+ }
+
+ auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc();
+ auto grad_weight_memory = grad_weight_usr_memory;
+ if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) {
+ grad_weight_memory = memory(grad_weight_pd);
+ }
+
+ std::shared_ptr<convolution_backward_weights> conv_backward_weight;
+ if (bias_defined) {
+ grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
+ grad_bias.data_ptr()));
+ conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
+ input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory));
+ } else {
+ conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
+ input_memory, grad_output_memory, grad_weight_memory));
+ }
+
+ net.push_back(*conv_backward_weight);
+
+ if (grad_weight_memory != grad_weight_usr_memory) {
+ net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory));
+ }
+
+ Stream::Instance().get_stream().submit(net);
+
+ return std::tuple<at::Tensor, at::Tensor>{grad_weight, grad_bias};
+}
+
+std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
+ const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+ IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask)
+{
+ Tensor grad_output = grad_output_t.contiguous();
+
+ Tensor grad_input, grad_weight, grad_bias;
+ if (output_mask[0]) {
+ grad_input = at::mkldnn_convolution_backward_input(
+ input.sizes(), grad_output, weight, padding, stride, dilation, output_mask[2]);
+ }
+ if (output_mask[1] || output_mask[2]) {
+ std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights(
+ weight.sizes(), grad_output, input, padding, stride, dilation, output_mask[2]);
+ }
+
+ return std::tuple<Tensor, Tensor, Tensor>{grad_input, grad_weight, grad_bias};
+}
+
+}} // namespace at::native
+
+#endif
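
All three bodies above follow the same MKLDNN 0.x pattern: describe every tensor with memory::format::any, let the primitive descriptor pick the internal layout, and insert reorder primitives only where the user-visible NCHW/OIHW layout differs from the chosen one. The arithmetic in conv_output_size is the standard formula, applied per spatial dimension: kernel extent = dilation*(k-1)+1, output = (in + 2*pad - extent)/stride + 1. A worked check of that formula (numbers illustrative, not from the patch):

    #include <cassert>
    #include <cstdint>

    // Same per-dimension arithmetic as conv_output_size above.
    int64_t out_extent(int64_t in, int64_t k, int64_t pad,
                       int64_t stride, int64_t dilation) {
      int64_t kernel = dilation * (k - 1) + 1;
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // ResNet-style first layer: 224 input, 7x7 kernel, pad 3, stride 2:
      // (224 + 6 - 7) / 2 + 1 = 112.
      assert(out_extent(224, 7, 3, 2, 1) == 112);
      // "Same" padding: 3x3 kernel, pad 1, stride 1 preserves the extent.
      assert(out_extent(32, 3, 1, 1, 1) == 32);
      return 0;
    }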
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 7e665909d7..93fa94eac3 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -711,3 +711,15 @@
dispatch:
CPU: _s_poisson_cpu
CUDA: _s_poisson_cuda
+
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation) -> Tensor
+ variants: function
+
+- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, bool bias_defined) -> Tensor
+ variants: function
+
+- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, bool bias_defined) -> (Tensor, Tensor)
+ variants: function
+
+- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
+ variants: function
diff --git a/setup.py b/setup.py
index a5bda0f2b2..68f134f62a 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,9 @@
# NO_CUDNN
# disables the cuDNN build
#
+# NO_MKLDNN
+# disables the MKLDNN build
+#
# NO_NNPACK
# disables NNPACK build
#
@@ -67,6 +70,11 @@
# NCCL_INCLUDE_DIR
# specify where nccl is installed
#
+# MKLDNN_LIB_DIR
+# MKLDNN_LIBRARY
+# MKLDNN_INCLUDE_DIR
+# specify where MKLDNN is installed
+#
# NVTOOLSEXT_PATH (Windows only)
# specify where nvtoolsext is installed
#
@@ -99,6 +107,8 @@ from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY,
CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR)
from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \
NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB
+from tools.setup_helpers.mkldnn import (WITH_MKLDNN, MKLDNN_LIBRARY,
+ MKLDNN_LIB_DIR, MKLDNN_INCLUDE_DIR)
from tools.setup_helpers.nnpack import WITH_NNPACK
from tools.setup_helpers.nvtoolext import NVTOOLEXT_HOME
from tools.setup_helpers.generate_code import generate_code
@@ -214,6 +224,11 @@ def build_libs(libs):
my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR
my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY
my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR
+ if WITH_MKLDNN:
+ my_env["MKLDNN_LIB_DIR"] = MKLDNN_LIB_DIR
+ my_env["MKLDNN_LIBRARY"] = MKLDNN_LIBRARY
+ my_env["MKLDNN_INCLUDE_DIR"] = MKLDNN_INCLUDE_DIR
+ build_libs_cmd += ['--with-mkldnn']
if WITH_GLOO_IBVERBS:
build_libs_cmd += ['--with-gloo-ibverbs']
@@ -397,6 +412,10 @@ class build_ext(build_ext_parent):
print('-- Detected CUDA at ' + CUDA_HOME)
else:
print('-- Not using CUDA')
+ if WITH_MKLDNN:
+ print('-- Detected MKLDNN at ' + MKLDNN_LIBRARY + ', ' + MKLDNN_INCLUDE_DIR)
+ else:
+ print('-- Not using MKLDNN')
if WITH_NCCL and WITH_SYSTEM_NCCL:
print('-- Using system provided NCCL library at ' +
NCCL_SYSTEM_LIB + ', ' + NCCL_INCLUDE_DIR)
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 98a46b4446..1cb4afb111 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1126,3 +1126,7 @@
- name: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, Tensor dropout_state)
input, hx, cx, weight: "_cudnn_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)"
+
+# mkldnn
+- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList stride, IntList dilation)
+ self, weight, bias: mkldnn_convolution_backward(self, grad, weight, padding, stride, dilation, grad_input_mask)
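
The derivative entry funnels all three gradients through the single backward function; grad_input_mask tells it which outputs autograd actually needs. A hedged sketch of calling it directly (backward_sketch is a hypothetical helper; padding/stride/dilation values illustrative):

    #include <ATen/ATen.h>
    #include <array>
    #include <tuple>

    // Request grad_input and grad_weight but skip grad_bias. Note that
    // output_mask[2] doubles as bias_defined inside the backward, so
    // backward_weights still runs here but produces no bias gradient.
    std::tuple<at::Tensor, at::Tensor, at::Tensor> backward_sketch(
        const at::Tensor& input, const at::Tensor& grad_output,
        const at::Tensor& weight) {
      std::array<bool, 3> mask = {{true, true, false}};
      return at::mkldnn_convolution_backward(
          input, grad_output, weight,
          /*padding=*/{1, 1}, /*stride=*/{1, 1}, /*dilation=*/{1, 1}, mask);
    }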
diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh
index cb403ca287..34f23317d8 100755
--- a/tools/build_pytorch_libs.sh
+++ b/tools/build_pytorch_libs.sh
@@ -23,6 +23,12 @@ if [[ "$1" == "--with-nnpack" ]]; then
shift
fi
+WITH_MKLDNN=0
+if [[ "$1" == "--with-mkldnn" ]]; then
+ WITH_MKLDNN=1
+ shift
+fi
+
WITH_GLOO_IBVERBS=0
if [[ "$1" == "--with-gloo-ibverbs" ]]; then
WITH_GLOO_IBVERBS=1
@@ -202,6 +208,10 @@ function build_aten() {
-DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \
-DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \
-DCUDNN_LIBRARY=$CUDNN_LIBRARY \
+ -DNO_MKLDNN=$((1-$WITH_MKLDNN)) \
+ -DMKLDNN_INCLUDE_DIR=$MKLDNN_INCLUDE_DIR \
+ -DMKLDNN_LIB_DIR=$MKLDNN_LIB_DIR \
+ -DMKLDNN_LIBRARY=$MKLDNN_LIBRARY \
-DATEN_NO_CONTRIB=1 \
-DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
diff --git a/tools/setup_helpers/mkldnn.py b/tools/setup_helpers/mkldnn.py
new file mode 100644
index 0000000000..2dd5445278
--- /dev/null
+++ b/tools/setup_helpers/mkldnn.py
@@ -0,0 +1,81 @@
+import platform
+import glob
+import os
+import sys
+
+from itertools import chain
+from .env import check_env_flag
+
+
+def gather_paths(env_vars):
+ return list(chain(*(os.getenv(v, '').split(':') for v in env_vars)))
+
+IS_LINUX = platform.system() == 'Linux'
+IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version
+CONDA_DIR = os.path.join(os.path.dirname(sys.executable), '..')
+
+MKLDNN_HOME = os.getenv('MKLDNN_HOME', '/usr/local/mkl-dnn')
+
+WITH_MKLDNN = False
+MKLDNN_LIB_DIR = None
+MKLDNN_INCLUDE_DIR = None
+MKLDNN_LIBRARY = None
+if IS_LINUX and not check_env_flag('NO_MKLDNN'):
+ lib_paths = list(filter(bool, [
+ os.getenv('MKLDNN_LIB_DIR'),
+ os.path.join(MKLDNN_HOME, 'lib'),
+ os.path.join(MKLDNN_HOME, 'lib64'),
+ '/usr/lib/',
+ '/usr/lib64/',
+ ] + gather_paths([
+ 'LIBRARY_PATH',
+ ]) + gather_paths([
+ 'LD_LIBRARY_PATH',
+ ])))
+ include_paths = list(filter(bool, [
+ os.getenv('MKLDNN_INCLUDE_DIR'),
+ os.path.join(MKLDNN_HOME, 'include'),
+ '/usr/include/',
+ ] + gather_paths([
+ 'CPATH',
+ 'C_INCLUDE_PATH',
+ 'CPLUS_INCLUDE_PATH',
+ ])))
+ if IS_CONDA:
+ lib_paths.append(os.path.join(CONDA_DIR, 'lib'))
+ include_paths.append(os.path.join(CONDA_DIR, 'include'))
+ for path in lib_paths:
+ if path is None or not os.path.exists(path):
+ continue
+ else:
+ libraries = sorted(glob.glob(os.path.join(path, 'libmkldnn*')))
+ if libraries:
+ if not glob.glob(os.path.join(path, 'libmklml_intel*')):
+ print("WARNING: MKL-DNN is not compiled with Intel MKL small library")
+ print("Convolution performance might be suboptimal")
+ print("Refer https://github.com/01org/mkl-dnn for detail info")
+ MKLDNN_LIBRARY = libraries[0]
+ MKLDNN_LIB_DIR = path
+ break
+ for path in include_paths:
+ if path is None or not os.path.exists(path):
+ continue
+ else:
+ if os.path.exists(os.path.join(path, 'mkldnn.hpp')):
+ MKLDNN_INCLUDE_DIR = path
+ break
+
+ # Specifying the library directly overrides the searched lib directory
+ library = os.getenv('MKLDNN_LIBRARY')
+ if library is not None and os.path.exists(library):
+ MKLDNN_LIBRARY = library
+ MKLDNN_LIB_DIR = os.path.dirname(MKLDNN_LIBRARY)
+
+ if not all([MKLDNN_LIBRARY, MKLDNN_LIB_DIR, MKLDNN_INCLUDE_DIR]):
+ MKLDNN_LIBRARY = MKLDNN_LIB_DIR = MKLDNN_INCLUDE_DIR = None
+ else:
+ real_mkldnn_library = os.path.realpath(MKLDNN_LIBRARY)
+ real_mkldnn_lib_dir = os.path.realpath(MKLDNN_LIB_DIR)
+ assert os.path.dirname(real_mkldnn_library) == real_mkldnn_lib_dir, (
+ 'mkldnn library and lib_dir must agree')
+ WITH_MKLDNN = True