-rw-r--r--  include/caffe/util/im2col.hpp          20
-rw-r--r--  include/caffe/vision_layers.hpp        12
-rw-r--r--  src/caffe/layers/conv_layer.cpp        58
-rw-r--r--  src/caffe/layers/conv_layer.cu         11
-rw-r--r--  src/caffe/layers/im2col_layer.cpp      51
-rw-r--r--  src/caffe/layers/im2col_layer.cu        6
-rw-r--r--  src/caffe/test/test_im2col_kernel.cu   17
-rw-r--r--  src/caffe/util/im2col.cpp              62
-rw-r--r--  src/caffe/util/im2col.cu               91
9 files changed, 216 insertions, 112 deletions
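
Note: every file below uses the same per-axis output geometry, now computed independently for height and width: out = (in + 2 * pad - kernel) / stride + 1. A minimal standalone sketch of that arithmetic with hypothetical dimensions (illustration only, not part of the patch):

    #include <cstdio>

    // Output extent along one axis: the same formula the patch uses in
    // ConvolutionLayer::SetUp and Im2colLayer::SetUp.
    int out_extent(int in, int kernel, int pad, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // A rectangular 3x5 kernel on a 28x28 input, pad 1x2, stride 2x1.
      int height_out = out_extent(28, 3, 1, 2);  // (28 + 2 - 3) / 2 + 1 = 14
      int width_out  = out_extent(28, 5, 2, 1);  // (28 + 4 - 5) / 1 + 1 = 28
      printf("%d x %d\n", height_out, width_out);
      return 0;
    }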
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index a649d8cc..c263997a 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -7,23 +7,27 @@ namespace caffe {
 
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im);
 
 }  // namespace caffe

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 05c7b45e..80dc3754 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -45,11 +45,11 @@ class ConvolutionLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
 
-  int kernel_size_;
-  int stride_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
   int num_;
   int channels_;
-  int pad_;
+  int pad_h_, pad_w_;
   int height_;
   int width_;
   int num_output_;
@@ -119,12 +119,12 @@ class Im2colLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
 
-  int kernel_size_;
-  int stride_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
   int channels_;
   int height_;
   int width_;
-  int pad_;
+  int pad_h_, pad_w_;
 };
 
 /* InnerProductLayer
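
Note: every old square-parameter call site maps onto the widened signatures above by repeating each scalar for both axes. A hypothetical convenience wrapper, not part of the patch, makes the correspondence explicit:

    #include "caffe/util/im2col.hpp"

    // Hypothetical wrapper (illustration only): forwards the old
    // square-parameter call onto the new rectangular signature by
    // passing each scalar for both the height and width axes.
    void im2col_cpu_square(const float* data_im, const int channels,
        const int height, const int width, const int ksize, const int pad,
        const int stride, float* data_col) {
      caffe::im2col_cpu(data_im, channels, height, width, ksize, ksize,
          pad, pad, stride, stride, data_col);
    }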
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 11f29e78..79b040f0 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -14,10 +14,42 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   Layer<Dtype>::SetUp(bottom, top);
-  kernel_size_ = this->layer_param_.convolution_param().kernel_size();
-  stride_ = this->layer_param_.convolution_param().stride();
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(!conv_param.has_kernel_size() !=
+      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(conv_param.has_kernel_size() ||
+      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+      && conv_param.has_pad_w())
+      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+      && conv_param.has_stride_w())
+      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
   group_ = this->layer_param_.convolution_param().group();
-  pad_ = this->layer_param_.convolution_param().pad();
   num_ = bottom[0]->num();
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
@@ -37,17 +69,18 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_EQ(channels_ % group_, 0);
   // The im2col result buffer would only hold one image at a time to avoid
   // overly large memory usage.
-  int height_out = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
-  int width_out = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  int height_out =
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  int width_out = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
   col_buffer_.Reshape(
-      1, channels_ * kernel_size_ * kernel_size_, height_out, width_out);
+      1, channels_ * kernel_h_ * kernel_w_, height_out, width_out);
   // Set the parameters
   CHECK_EQ(num_output_ % group_, 0)
       << "Number of output should be multiples of group.";
   bias_term_ = this->layer_param_.convolution_param().bias_term();
   // Figure out the dimensions for individual gemms.
   M_ = num_output_ / group_;
-  K_ = channels_ * kernel_size_ * kernel_size_ / group_;
+  K_ = channels_ * kernel_h_ * kernel_w_ / group_;
   N_ = height_out * width_out;
   for (int top_id = 0; top_id < top->size(); ++top_id) {
     (*top)[top_id]->Reshape(num_, num_output_, height_out, width_out);
@@ -63,7 +96,7 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   }
   // Intialize the weight
   this->blobs_[0].reset(new Blob<Dtype>(
-      num_output_, channels_ / group_, kernel_size_, kernel_size_));
+      num_output_, channels_ / group_, kernel_h_, kernel_w_));
   // fill the weights
   shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
       this->layer_param_.convolution_param().weight_filler()));
@@ -99,7 +132,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   for (int n = 0; n < num_; ++n) {
     // First, im2col
     im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, col_data);
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, col_data);
     // Second, innerproduct with groups
     for (int g = 0; g < group_; ++g) {
       caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -160,7 +194,8 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       // Since we saved memory in the forward pass by not storing all col
       // data, we will need to recompute them.
       im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
-          width_, kernel_size_, pad_, stride_, col_data);
+          width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, col_data);
       // gradient w.r.t. weight. Note that we will accumulate diffs.
       if (this->param_propagate_down_[0]) {
         for (int g = 0; g < group_; ++g) {
@@ -179,8 +214,8 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
             (Dtype)0., col_diff + col_offset * g);
       }
       // col2im back to the data
-      col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
-          stride_, bottom_diff + (*bottom)[i]->offset(n));
+      col2im_cpu(col_diff, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
     }
   }
 }
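
Note: the first CHECK in SetUp uses !a != !b as a boolean XOR, so exactly one of kernel_size or the (kernel_h, kernel_w) pair may be set. A standalone sketch of the same predicate (hypothetical helper, for illustration):

    #include <cstdio>

    // Mirrors the patch's CHECK: exactly one of {kernel_size} or
    // {kernel_h AND kernel_w} must be specified.
    bool kernel_spec_ok(bool has_size, bool has_h, bool has_w) {
      return !has_size != !(has_h && has_w);
    }

    int main() {
      printf("%d\n", kernel_spec_ok(true, false, false));  // 1: square form
      printf("%d\n", kernel_spec_ok(false, true, true));   // 1: rectangular form
      printf("%d\n", kernel_spec_ok(true, true, true));    // 0: both given
      printf("%d\n", kernel_spec_ok(false, true, false));  // 0: kernel_w missing
      return 0;
    }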
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index d328fbde..86c047b4 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -24,7 +24,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   for (int n = 0; n < num_; ++n) {
     // First, im2col
     im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, col_data);
+        width_, kernel_h_, kernel_w_, pad_h_,
+        pad_w_, stride_h_, stride_w_, col_data);
     // Second, innerproduct with groups
     for (int g = 0; g < group_; ++g) {
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -85,7 +87,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       // Since we saved memory in the forward pass by not storing all col
       // data, we will need to recompute them.
       im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
-          width_, kernel_size_, pad_, stride_, col_data);
+          width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, col_data);
       // gradient w.r.t. weight. Note that we will accumulate diffs.
       if (this->param_propagate_down_[0]) {
         for (int g = 0; g < group_; ++g) {
@@ -104,8 +107,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
             (Dtype)0., col_diff + col_offset * g);
       }
       // col2im back to the data
-      col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
-          stride_, bottom_diff + (*bottom)[i]->offset(n));
+      col2im_gpu(col_diff, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
     }
   }
 }
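
Note: the gemm shapes used by both the CPU and GPU paths carry over unchanged, except that K_ now counts kernel_h * kernel_w column entries instead of kernel_size squared. A worked sketch with a hypothetical layer configuration (illustration only):

    #include <cstdio>

    int main() {
      // Hypothetical layer: 96 outputs, 3 input channels, 11x7 kernel,
      // stride 4x4, no pad, 227x227 input, group 1.
      int num_output = 96, channels = 3, group = 1;
      int kernel_h = 11, kernel_w = 7, stride_h = 4, stride_w = 4;
      int height = 227, width = 227, pad_h = 0, pad_w = 0;

      int height_out = (height + 2 * pad_h - kernel_h) / stride_h + 1;  // 55
      int width_out  = (width + 2 * pad_w - kernel_w) / stride_w + 1;   // 56
      int M = num_output / group;                      // weight gemm rows
      int K = channels * kernel_h * kernel_w / group;  // 3 * 11 * 7 = 231
      int N = height_out * width_out;                  // output pixels
      printf("M=%d K=%d N=%d\n", M, K, N);
      return 0;
    }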
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 2c86412b..fabce0a6 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -13,15 +13,48 @@ template <typename Dtype>
 void Im2colLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   Layer<Dtype>::SetUp(bottom, top);
-  kernel_size_ = this->layer_param_.convolution_param().kernel_size();
-  stride_ = this->layer_param_.convolution_param().stride();
-  pad_ = this->layer_param_.convolution_param().pad();
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(!conv_param.has_kernel_size() !=
+      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(conv_param.has_kernel_size() ||
+      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+      && conv_param.has_pad_w())
+      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+      && conv_param.has_stride_w())
+      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
   width_ = bottom[0]->width();
-  (*top)[0]->Reshape(bottom[0]->num(), channels_ * kernel_size_ * kernel_size_,
-      (height_ + 2 * pad_ - kernel_size_) / stride_ + 1,
-      (width_ + 2 * pad_ - kernel_size_) / stride_ + 1);
+  (*top)[0]->Reshape(
+      bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+      (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
 }
 
 template <typename Dtype>
@@ -31,7 +64,8 @@ Dtype Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = (*top)[0]->mutable_cpu_data();
   for (int n = 0; n < bottom[0]->num(); ++n) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
   }
   return Dtype(0.);
 }
@@ -43,7 +77,8 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
     col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
   }
 }

diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu
index 9cfb74e8..84439742 100644
--- a/src/caffe/layers/im2col_layer.cu
+++ b/src/caffe/layers/im2col_layer.cu
@@ -16,7 +16,8 @@ Dtype Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = (*top)[0]->mutable_gpu_data();
   for (int n = 0; n < bottom[0]->num(); ++n) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
   }
   return Dtype(0.);
 }
@@ -28,7 +29,8 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
     col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
   }
 }
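
Note: the Im2colLayer top blob grows to channels * kernel_h * kernel_w channels, so a rectangular kernel changes the buffer footprint through the product of the two extents. A quick sketch of the resulting shape and element count with hypothetical dimensions (illustration only):

    #include <cstdio>

    int main() {
      // Hypothetical bottom blob: num=2, channels=3, 16x20 image;
      // 3x5 kernel, pad 0, stride 1x1.
      int num = 2, channels = 3, height = 16, width = 20;
      int kernel_h = 3, kernel_w = 5, pad_h = 0, pad_w = 0;
      int stride_h = 1, stride_w = 1;

      int channels_col = channels * kernel_h * kernel_w;                // 45
      int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;  // 14
      int width_col  = (width + 2 * pad_w - kernel_w) / stride_w + 1;   // 16
      // Shape mirrors the (*top)[0]->Reshape(...) call in the diff above.
      printf("top: %d x %d x %d x %d (%d elements)\n",
             num, channels_col, height_col, width_col,
             num * channels_col * height_col * width_col);
      return 0;
    }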
diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu
index 5671968b..37d1a152 100644
--- a/src/caffe/test/test_im2col_kernel.cu
+++ b/src/caffe/test/test_im2col_kernel.cu
@@ -17,8 +17,10 @@ namespace caffe {
 // Forward declare kernel functions
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
     Dtype* data_col);
 
 extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
@@ -87,8 +89,10 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
   // CPU Version
   for (int n = 0; n < this->blob_bottom_->num(); ++n) {
     im2col_cpu(this->blob_bottom_->cpu_data() + this->blob_bottom_->offset(n),
-        this->channels_, this->height_, this->width_, this->kernel_size_,
-        this->pad_, this->stride_, cpu_data + this->blob_top_cpu_->offset(n));
+        this->channels_, this->height_, this->width_,
+        this->kernel_size_, this->kernel_size_, this->pad_, this->pad_,
+        this->stride_, this->stride_,
+        cpu_data + this->blob_top_cpu_->offset(n));
   }
 
   // GPU version
@@ -102,8 +106,9 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
       // NOLINT_NEXT_LINE(whitespace/operators)
       im2col_gpu_kernel<TypeParam><<<grid_dim, CAFFE_CUDA_NUM_THREADS>>>(
           num_kernels, bottom_data + this->blob_bottom_->offset(n),
-          this->height_, this->width_, this->kernel_size_, this->pad_,
-          this->stride_, this->height_col_, this->width_col_,
+          this->height_, this->width_, this->kernel_size_, this->kernel_size_,
+          this->pad_, this->pad_, this->stride_, this->stride_,
+          this->height_col_, this->width_col_,
           top_data + this->blob_top_->offset(n));
       CUDA_POST_KERNEL_CHECK;
     }
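
Note: the updated test still drives both paths with square parameters (kernel_size_ is simply passed for both axes). A rectangular case can be spot-checked directly against im2col_cpu; a self-contained sketch with hypothetical toy data (not part of the test suite):

    #include <cstdio>
    #include <vector>
    #include "caffe/util/im2col.hpp"

    int main() {
      using caffe::im2col_cpu;
      // 1 channel, 3x4 image, 2x3 kernel, no pad, stride 1: the output
      // grid is 2x2 and there are 1 * 2 * 3 = 6 column channels.
      const int channels = 1, height = 3, width = 4;
      const int kernel_h = 2, kernel_w = 3;
      std::vector<float> im(channels * height * width);
      for (int i = 0; i < static_cast<int>(im.size()); ++i) im[i] = i;
      const int height_col = height - kernel_h + 1;  // 2
      const int width_col = width - kernel_w + 1;    // 2
      std::vector<float> col(channels * kernel_h * kernel_w
                             * height_col * width_col);
      im2col_cpu(im.data(), channels, height, width, kernel_h, kernel_w,
                 0, 0, 1, 1, col.data());
      // The first column channel holds the top-left element of each window.
      printf("%g %g %g %g\n", col[0], col[1], col[2], col[3]);  // 0 1 4 5
      return 0;
    }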
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index ce4e1884..ca1dc797 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -11,19 +11,21 @@ namespace caffe {
 
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col) {
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
-  int channels_col = channels * ksize * ksize;
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col) {
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int channels_col = channels * kernel_h * kernel_w;
   for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % ksize;
-    int h_offset = (c / ksize) % ksize;
-    int c_im = c / ksize / ksize;
+    int w_offset = c % kernel_w;
+    int h_offset = (c / kernel_w) % kernel_h;
+    int c_im = c / kernel_h / kernel_w;
     for (int h = 0; h < height_col; ++h) {
       for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride - pad + h_offset;
-        int w_pad = w * stride - pad + w_offset;
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
           data_col[(c * height_col + h) * width_col + w] =
             data_im[(c_im * height + h_pad) * width + w_pad];
@@ -36,28 +38,32 @@ void im2col_cpu(const Dtype* data_im, const int channels,
 
 // Explicit instantiation
 template void im2col_cpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_col);
 template void im2col_cpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_col);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im) {
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_im) {
   caffe_set(height * width * channels, Dtype(0), data_im);
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
-  int channels_col = channels * ksize * ksize;
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int channels_col = channels * patch_h * patch_w;
   for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % ksize;
-    int h_offset = (c / ksize) % ksize;
-    int c_im = c / ksize / ksize;
+    int w_offset = c % patch_w;
+    int h_offset = (c / patch_w) % patch_h;
+    int c_im = c / patch_h / patch_w;
     for (int h = 0; h < height_col; ++h) {
       for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride - pad + h_offset;
-        int w_pad = w * stride - pad + w_offset;
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
           data_im[(c_im * height + h_pad) * width + w_pad] +=
               data_col[(c * height_col + h) * width_col + w];
@@ -68,10 +74,12 @@ void col2im_cpu(const Dtype* data_col, const int channels,
 
 // Explicit instantiation
 template void col2im_cpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, float* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, double* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_im);
 
 }  // namespace caffe
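
Note: the column channel index is decomposed with the width axis innermost, c = (c_im * kernel_h + h_offset) * kernel_w + w_offset, which is why w_offset is c % kernel_w and h_offset is (c / kernel_w) % kernel_h. A round-trip check of that decomposition (standalone sketch, hypothetical dimensions):

    #include <cassert>
    #include <cstdio>

    int main() {
      const int channels = 3, kernel_h = 2, kernel_w = 5;  // rectangular on purpose
      for (int c = 0; c < channels * kernel_h * kernel_w; ++c) {
        int w_offset = c % kernel_w;
        int h_offset = (c / kernel_w) % kernel_h;
        int c_im = c / kernel_h / kernel_w;
        // Recompose: w_offset varies fastest, then h_offset, then c_im.
        assert(c == (c_im * kernel_h + h_offset) * kernel_w + w_offset);
      }
      printf("decomposition round-trips\n");
      return 0;
    }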
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index 79faa6cb..b565d2d3 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -12,23 +12,25 @@ namespace caffe {
 
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
    Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
     int w_out = index % width_col;
     int h_index = index / width_col;
     int h_out = h_index % height_col;
     int channel_in = h_index / height_col;
-    int channel_out = channel_in * ksize * ksize;
-    int h_in = h_out * stride - pad;
-    int w_in = w_out * stride - pad;
+    int channel_out = channel_in * kernel_h * kernel_w;
+    int h_in = h_out * stride_h - pad_h;
+    int w_in = w_out * stride_w - pad_w;
     Dtype* data_col_ptr = data_col;
     data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
     const Dtype* data_im_ptr = data_im;
     data_im_ptr += (channel_in * height + h_in) * width + w_in;
-    for (int i = 0; i < ksize; ++i) {
-      for (int j = 0; j < ksize; ++j) {
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
         int h = h_in + i;
         int w = w_in + j;
         *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
@@ -41,17 +43,20 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col) {
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col) {
   // We are going to launch channels * height_col * width_col kernels, each
   // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
   int num_kernels = channels * height_col * width_col;
   // NOLINT_NEXT_LINE(whitespace/operators)
   im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_im, height, width, ksize, pad, stride, height_col,
+      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+      pad_w, stride_h, stride_w, height_col,
       width_col, data_col);
   CUDA_POST_KERNEL_CHECK;
 }
@@ -59,40 +64,46 @@ void im2col_gpu(const Dtype* data_im, const int channels,
 
 // Explicit instantiation
 template void im2col_gpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_col);
 template void im2col_gpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_col);
 
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
-    const int height, const int width, const int channels, const int ksize,
-    const int pad, const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
     Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
     Dtype val = 0;
-    int w = index % width + pad;
-    int h = (index / width) % height + pad;
+    int w = index % width + pad_w;
+    int h = (index / width) % height + pad_h;
     int c = index / (width * height);
     // compute the start and end of the output
-    int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-    int w_col_end = min(w / stride + 1, width_col);
-    int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-    int h_col_end = min(h / stride + 1, height_col);
+    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+    int w_col_end = min(w / stride_w + 1, width_col);
+    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+    int h_col_end = min(h / stride_h + 1, height_col);
    /*
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        // the col location: [c * width * height + h_out, w_out]
-        int c_col = c * ksize * ksize + (h - h_col * stride) * ksize + (w - w_col * stride);
+        int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w +
+            (w - w_col * stride_w);
        val += data_col[(c_col * height_col + h_col) * width_col + w_col];
      }
    }
    */
    // equivalent implementation
-    int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-    int coeff_w_col = (1 - stride * height_col * width_col);
+    int offset =
+        (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+    int coeff_w_col = (1 - stride_w * height_col * width_col);
     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
         val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
@@ -104,29 +115,31 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im) {
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im) {
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
   int num_kernels = channels * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
   // NOLINT_NEXT_LINE(whitespace/operators)
   col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_col, height, width, channels, ksize, pad, stride,
+      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      pad_h, pad_w, stride_h, stride_w,
       height_col, width_col, data_im);
   CUDA_POST_KERNEL_CHECK;
 }
-
 // Explicit instantiation
 template void col2im_gpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, float* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, double* data_im);
-
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_im);
 }  // namespace caffe
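
Note: the "equivalent implementation" in col2im_gpu_kernel folds the naive column index into a base offset plus per-step coefficients. Expanding (c_col * height_col + h_col) * width_col + w_col with c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w) yields exactly offset + h_col * coeff_h_col + w_col * coeff_w_col. A brute-force check of that identity (standalone sketch, arbitrary hypothetical geometry):

    #include <cassert>
    #include <cstdio>

    int main() {
      // Any positive geometry satisfies the identity: it is pure index
      // algebra, independent of the h_col/w_col range clamping.
      const int patch_h = 3, patch_w = 5, stride_h = 2, stride_w = 1;
      const int height_col = 7, width_col = 9;
      const int c = 2, h = 6, w = 4;  // padded image coordinates

      int offset = (c * patch_h * patch_w + h * patch_w + w)
          * height_col * width_col;
      int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
      int coeff_w_col = (1 - stride_w * height_col * width_col);

      for (int h_col = 0; h_col < height_col; ++h_col) {
        for (int w_col = 0; w_col < width_col; ++w_col) {
          int c_col = c * patch_h * patch_w
              + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w);
          int naive = (c_col * height_col + h_col) * width_col + w_col;
          assert(naive == offset + h_col * coeff_h_col + w_col * coeff_w_col);
        }
      }
      printf("closed form matches naive index\n");
      return 0;
    }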