-rw-r--r--  include/caffe/util/im2col.hpp         |  20
-rw-r--r--  include/caffe/vision_layers.hpp       |  12
-rw-r--r--  src/caffe/layers/conv_layer.cpp       |  58
-rw-r--r--  src/caffe/layers/conv_layer.cu        |  11
-rw-r--r--  src/caffe/layers/im2col_layer.cpp     |  51
-rw-r--r--  src/caffe/layers/im2col_layer.cu      |   6
-rw-r--r--  src/caffe/test/test_im2col_kernel.cu  |  17
-rw-r--r--  src/caffe/util/im2col.cpp             |  62
-rw-r--r--  src/caffe/util/im2col.cu              |  91
9 files changed, 216 insertions(+), 112 deletions(-)
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index a649d8cc..c263997a 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -7,23 +7,27 @@ namespace caffe {
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, Dtype* data_col);
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, Dtype* data_col);
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
- const int height, const int width, const int psize, const int pad,
- const int stride, Dtype* data_im);
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, Dtype* data_im);
template <typename Dtype>
void im2col_gpu(const Dtype* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, Dtype* data_col);
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, Dtype* data_col);
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
- const int height, const int width, const int psize, const int pad,
- const int stride, Dtype* data_im);
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, Dtype* data_im);
} // namespace caffe
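Note: each generalized declaration above keeps the original argument order but splits every square parameter (ksize/psize, pad, stride) into an _h/_w pair. A minimal caller sketch, with made-up dimensions and assuming a Caffe build with this patch applied:

    #include <vector>
    #include "caffe/util/im2col.hpp"

    // Hypothetical example: a 3-channel 8x10 image with a 3x5 kernel,
    // pad 1x2 and stride 1x2. All values are illustrative only.
    void im2col_example() {
      const int channels = 3, height = 8, width = 10;
      const int kernel_h = 3, kernel_w = 5;
      const int pad_h = 1, pad_w = 2;
      const int stride_h = 1, stride_w = 2;
      const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;  // 8
      const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;    // 5
      std::vector<float> im(channels * height * width);
      std::vector<float> col(
          channels * kernel_h * kernel_w * height_col * width_col);
      caffe::im2col_cpu(&im[0], channels, height, width, kernel_h, kernel_w,
                        pad_h, pad_w, stride_h, stride_w, &col[0]);
    }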
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 05c7b45e..80dc3754 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -45,11 +45,11 @@ class ConvolutionLayer : public Layer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
- int kernel_size_;
- int stride_;
+ int kernel_h_, kernel_w_;
+ int stride_h_, stride_w_;
int num_;
int channels_;
- int pad_;
+ int pad_h_, pad_w_;
int height_;
int width_;
int num_output_;
@@ -119,12 +119,12 @@ class Im2colLayer : public Layer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
- int kernel_size_;
- int stride_;
+ int kernel_h_, kernel_w_;
+ int stride_h_, stride_w_;
int channels_;
int height_;
int width_;
- int pad_;
+ int pad_h_, pad_w_;
};
/* InnerProductLayer
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 11f29e78..79b040f0 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -14,10 +14,42 @@ template <typename Dtype>
void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
Layer<Dtype>::SetUp(bottom, top);
- kernel_size_ = this->layer_param_.convolution_param().kernel_size();
- stride_ = this->layer_param_.convolution_param().stride();
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ CHECK(!conv_param.has_kernel_size() !=
+ !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+ << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+ CHECK(conv_param.has_kernel_size() ||
+ (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+ << "For non-square filters both kernel_h and kernel_w are required.";
+ CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+ && conv_param.has_pad_w())
+ || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+ << "pad is pad OR pad_h and pad_w are required.";
+ CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+ && conv_param.has_stride_w())
+ || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+ << "Stride is stride OR stride_h and stride_w are required.";
+ if (conv_param.has_kernel_size()) {
+ kernel_h_ = kernel_w_ = conv_param.kernel_size();
+ } else {
+ kernel_h_ = conv_param.kernel_h();
+ kernel_w_ = conv_param.kernel_w();
+ }
+ CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+ CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+ if (!conv_param.has_pad_h()) {
+ pad_h_ = pad_w_ = conv_param.pad();
+ } else {
+ pad_h_ = conv_param.pad_h();
+ pad_w_ = conv_param.pad_w();
+ }
+ if (!conv_param.has_stride_h()) {
+ stride_h_ = stride_w_ = conv_param.stride();
+ } else {
+ stride_h_ = conv_param.stride_h();
+ stride_w_ = conv_param.stride_w();
+ }
group_ = this->layer_param_.convolution_param().group();
- pad_ = this->layer_param_.convolution_param().pad();
num_ = bottom[0]->num();
channels_ = bottom[0]->channels();
height_ = bottom[0]->height();
@@ -37,17 +69,18 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
CHECK_EQ(channels_ % group_, 0);
// The im2col result buffer would only hold one image at a time to avoid
// overly large memory usage.
- int height_out = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
- int width_out = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+ int height_out =
+ (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+ int width_out = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
col_buffer_.Reshape(
- 1, channels_ * kernel_size_ * kernel_size_, height_out, width_out);
+ 1, channels_ * kernel_h_ * kernel_w_, height_out, width_out);
// Set the parameters
CHECK_EQ(num_output_ % group_, 0)
<< "Number of output should be multiples of group.";
bias_term_ = this->layer_param_.convolution_param().bias_term();
// Figure out the dimensions for individual gemms.
M_ = num_output_ / group_;
- K_ = channels_ * kernel_size_ * kernel_size_ / group_;
+ K_ = channels_ * kernel_h_ * kernel_w_ / group_;
N_ = height_out * width_out;
for (int top_id = 0; top_id < top->size(); ++top_id) {
(*top)[top_id]->Reshape(num_, num_output_, height_out, width_out);
@@ -63,7 +96,7 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
}
// Initialize the weight
this->blobs_[0].reset(new Blob<Dtype>(
- num_output_, channels_ / group_, kernel_size_, kernel_size_));
+ num_output_, channels_ / group_, kernel_h_, kernel_w_));
// fill the weights
shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
this->layer_param_.convolution_param().weight_filler()));
@@ -99,7 +132,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
for (int n = 0; n < num_; ++n) {
// First, im2col
im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
- width_, kernel_size_, pad_, stride_, col_data);
+ width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, col_data);
// Second, innerproduct with groups
for (int g = 0; g < group_; ++g) {
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -160,7 +194,7 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
- width_, kernel_size_, pad_, stride_, col_data);
+ width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
if (this->param_propagate_down_[0]) {
for (int g = 0; g < group_; ++g) {
@@ -179,8 +213,8 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
(Dtype)0., col_diff + col_offset * g);
}
// col2im back to the data
- col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
- stride_, bottom_diff + (*bottom)[i]->offset(n));
+ col2im_cpu(col_diff, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
}
}
}
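Note: the height_out/width_out expressions in SetUp are the standard convolution output-size arithmetic, now applied per axis. A tiny helper (hypothetical, not part of the patch) makes the formula easy to sanity-check:

    // Output extent along one axis; integer division floors, exactly as in
    // the SetUp code above.
    inline int conv_out_dim(int in, int kernel, int pad, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }
    // Examples with made-up sizes:
    //   conv_out_dim(11, 3, 1, 2) == (11 + 2 - 3) / 2 + 1 == 6
    //   conv_out_dim(15, 5, 0, 5) == (15 + 0 - 5) / 5 + 1 == 3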
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index d328fbde..86c047b4 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -24,7 +24,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
for (int n = 0; n < num_; ++n) {
// First, im2col
im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
- width_, kernel_size_, pad_, stride_, col_data);
+ width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, col_data);
// Second, innerproduct with groups
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -65,6 +66,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = NULL;
// Bias gradient, if necessary.
if (bias_term_ && this->param_propagate_down_[1]) {
top_diff = top[i]->gpu_diff();
for (int n = 0; n < num_; ++n) {
caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
@@ -85,7 +87,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
- width_, kernel_size_, pad_, stride_, col_data);
+ width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
if (this->param_propagate_down_[0]) {
for (int g = 0; g < group_; ++g) {
@@ -104,8 +107,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
(Dtype)0., col_diff + col_offset * g);
}
// col2im back to the data
- col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
- stride_, bottom_diff + (*bottom)[i]->offset(n));
+ col2im_gpu(col_diff, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
}
}
}
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 2c86412b..fabce0a6 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -13,15 +13,48 @@ template <typename Dtype>
void Im2colLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
Layer<Dtype>::SetUp(bottom, top);
- kernel_size_ = this->layer_param_.convolution_param().kernel_size();
- stride_ = this->layer_param_.convolution_param().stride();
- pad_ = this->layer_param_.convolution_param().pad();
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ CHECK(!conv_param.has_kernel_size() !=
+ !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+ << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+ CHECK(conv_param.has_kernel_size() ||
+ (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+ << "For non-square filters both kernel_h and kernel_w are required.";
+ CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+ && conv_param.has_pad_w())
+ || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+ << "pad is pad OR pad_h and pad_w are required.";
+ CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+ && conv_param.has_stride_w())
+ || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+ << "Stride is stride OR stride_h and stride_w are required.";
+ if (conv_param.has_kernel_size()) {
+ kernel_h_ = kernel_w_ = conv_param.kernel_size();
+ } else {
+ kernel_h_ = conv_param.kernel_h();
+ kernel_w_ = conv_param.kernel_w();
+ }
+ CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+ CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+ if (!conv_param.has_pad_h()) {
+ pad_h_ = pad_w_ = conv_param.pad();
+ } else {
+ pad_h_ = conv_param.pad_h();
+ pad_w_ = conv_param.pad_w();
+ }
+ if (!conv_param.has_stride_h()) {
+ stride_h_ = stride_w_ = conv_param.stride();
+ } else {
+ stride_h_ = conv_param.stride_h();
+ stride_w_ = conv_param.stride_w();
+ }
channels_ = bottom[0]->channels();
height_ = bottom[0]->height();
width_ = bottom[0]->width();
- (*top)[0]->Reshape(bottom[0]->num(), channels_ * kernel_size_ * kernel_size_,
- (height_ + 2 * pad_ - kernel_size_) / stride_ + 1,
- (width_ + 2 * pad_ - kernel_size_) / stride_ + 1);
+ (*top)[0]->Reshape(
+ bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+ (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+ (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
}
template <typename Dtype>
@@ -31,7 +64,8 @@ Dtype Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
Dtype* top_data = (*top)[0]->mutable_cpu_data();
for (int n = 0; n < bottom[0]->num(); ++n) {
im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
- width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+ width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
}
return Dtype(0.);
}
@@ -43,7 +77,8 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
for (int n = 0; n < top[0]->num(); ++n) {
col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
- kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+ kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
}
}
diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu
index 9cfb74e8..84439742 100644
--- a/src/caffe/layers/im2col_layer.cu
+++ b/src/caffe/layers/im2col_layer.cu
@@ -16,7 +16,8 @@ Dtype Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
Dtype* top_data = (*top)[0]->mutable_gpu_data();
for (int n = 0; n < bottom[0]->num(); ++n) {
im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
- width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+ width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
}
return Dtype(0.);
}
@@ -28,7 +29,8 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
for (int n = 0; n < top[0]->num(); ++n) {
col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
- kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+ kernel_h_, kernel_w_, pad_h_, pad_w_,
+ stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
}
}
diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu
index 5671968b..37d1a152 100644
--- a/src/caffe/test/test_im2col_kernel.cu
+++ b/src/caffe/test/test_im2col_kernel.cu
@@ -17,8 +17,10 @@ namespace caffe {
// Forward declare kernel functions
template <typename Dtype>
__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
- const int height, const int width, const int ksize, const int pad,
- const int stride, const int height_col, const int width_col,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int height_col, const int width_col,
Dtype* data_col);
extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
@@ -87,8 +89,10 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
// CPU Version
for (int n = 0; n < this->blob_bottom_->num(); ++n) {
im2col_cpu(this->blob_bottom_->cpu_data() + this->blob_bottom_->offset(n),
- this->channels_, this->height_, this->width_, this->kernel_size_,
- this->pad_, this->stride_, cpu_data + this->blob_top_cpu_->offset(n));
+ this->channels_, this->height_, this->width_,
+ this->kernel_size_, this->kernel_size_, this->pad_, this->pad_,
+ this->stride_, this->stride_,
+ cpu_data + this->blob_top_cpu_->offset(n));
}
// GPU version
@@ -102,8 +106,9 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
// NOLINT_NEXT_LINE(whitespace/operators)
im2col_gpu_kernel<TypeParam><<<grid_dim, CAFFE_CUDA_NUM_THREADS>>>(
num_kernels, bottom_data + this->blob_bottom_->offset(n),
- this->height_, this->width_, this->kernel_size_, this->pad_,
- this->stride_, this->height_col_, this->width_col_,
+ this->height_, this->width_, this->kernel_size_, this->kernel_size_,
+ this->pad_, this->pad_, this->stride_, this->stride_,
+ this->height_col_, this->width_col_,
top_data + this->blob_top_->offset(n));
CUDA_POST_KERNEL_CHECK;
}
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index ce4e1884..ca1dc797 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -11,19 +11,21 @@ namespace caffe {
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, Dtype* data_col) {
- int height_col = (height + 2 * pad - ksize) / stride + 1;
- int width_col = (width + 2 * pad - ksize) / stride + 1;
- int channels_col = channels * ksize * ksize;
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ Dtype* data_col) {
+ int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+ int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
- int w_offset = c % ksize;
- int h_offset = (c / ksize) % ksize;
- int c_im = c / ksize / ksize;
+ int w_offset = c % kernel_w;
+ int h_offset = (c / kernel_w) % kernel_h;
+ int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
for (int w = 0; w < width_col; ++w) {
- int h_pad = h * stride - pad + h_offset;
- int w_pad = w * stride - pad + w_offset;
+ int h_pad = h * stride_h - pad_h + h_offset;
+ int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(c * height_col + h) * width_col + w] =
data_im[(c_im * height + h_pad) * width + w_pad];
@@ -36,28 +38,32 @@ void im2col_cpu(const Dtype* data_im, const int channels,
// Explicit instantiation
template void im2col_cpu<float>(const float* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, float* data_col);
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, float* data_col);
template void im2col_cpu<double>(const double* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, double* data_col);
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, double* data_col);
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, Dtype* data_im) {
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w, Dtype* data_im) {
caffe_set(height * width * channels, Dtype(0), data_im);
- int height_col = (height + 2 * pad - ksize) / stride + 1;
- int width_col = (width + 2 * pad - ksize) / stride + 1;
- int channels_col = channels * ksize * ksize;
+ int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+ int channels_col = channels * patch_h * patch_w;
for (int c = 0; c < channels_col; ++c) {
- int w_offset = c % ksize;
- int h_offset = (c / ksize) % ksize;
- int c_im = c / ksize / ksize;
+ int w_offset = c % patch_w;
+ int h_offset = (c / patch_w) % patch_h;
+ int c_im = c / patch_h / patch_w;
for (int h = 0; h < height_col; ++h) {
for (int w = 0; w < width_col; ++w) {
- int h_pad = h * stride - pad + h_offset;
- int w_pad = w * stride - pad + w_offset;
+ int h_pad = h * stride_h - pad_h + h_offset;
+ int w_pad = w * stride_w - pad_w + w_offset;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(c_im * height + h_pad) * width + w_pad] +=
data_col[(c * height_col + h) * width_col + w];
@@ -68,10 +74,12 @@ void col2im_cpu(const Dtype* data_col, const int channels,
// Explicit instantiation
template void col2im_cpu<float>(const float* data_col, const int channels,
- const int height, const int width, const int psize, const int pad,
- const int stride, float* data_im);
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, float* data_im);
template void col2im_cpu<double>(const double* data_col, const int channels,
- const int height, const int width, const int psize, const int pad,
- const int stride, double* data_im);
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, double* data_im);
} // namespace caffe
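Note: both CPU loops decode the column-channel index c as a packed (c_im, h_offset, w_offset) triple, with the kernel-width coordinate varying fastest. A small self-check of that decomposition (illustrative only, not part of the patch):

    #include <cassert>

    // Re-derive (c_im, h_off, w_off) from c the way im2col_cpu and
    // col2im_cpu unpack it: c = (c_im * kernel_h + h_off) * kernel_w + w_off.
    void check_offset_decomposition(int channels, int kernel_h, int kernel_w) {
      int c = 0;
      for (int c_im = 0; c_im < channels; ++c_im)
        for (int h_off = 0; h_off < kernel_h; ++h_off)
          for (int w_off = 0; w_off < kernel_w; ++w_off, ++c) {
            assert(c % kernel_w == w_off);
            assert((c / kernel_w) % kernel_h == h_off);
            assert(c / kernel_h / kernel_w == c_im);  // == c / (kernel_h * kernel_w)
          }
    }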
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index 79faa6cb..b565d2d3 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -12,23 +12,25 @@ namespace caffe {
template <typename Dtype>
__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
- const int height, const int width, const int ksize, const int pad,
- const int stride, const int height_col, const int width_col,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int height_col, const int width_col,
Dtype* data_col) {
CUDA_KERNEL_LOOP(index, n) {
int w_out = index % width_col;
int h_index = index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
- int channel_out = channel_in * ksize * ksize;
- int h_in = h_out * stride - pad;
- int w_in = w_out * stride - pad;
+ int channel_out = channel_in * kernel_h * kernel_w;
+ int h_in = h_out * stride_h - pad_h;
+ int w_in = w_out * stride_w - pad_w;
Dtype* data_col_ptr = data_col;
data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
const Dtype* data_im_ptr = data_im;
data_im_ptr += (channel_in * height + h_in) * width + w_in;
- for (int i = 0; i < ksize; ++i) {
- for (int j = 0; j < ksize; ++j) {
+ for (int i = 0; i < kernel_h; ++i) {
+ for (int j = 0; j < kernel_w; ++j) {
int h = h_in + i;
int w = w_in + j;
*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
@@ -41,17 +43,20 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
template <typename Dtype>
void im2col_gpu(const Dtype* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, Dtype* data_col) {
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ Dtype* data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
- int height_col = (height + 2 * pad - ksize) / stride + 1;
- int width_col = (width + 2 * pad - ksize) / stride + 1;
+ int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int num_kernels = channels * height_col * width_col;
// NOLINT_NEXT_LINE(whitespace/operators)
im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
CAFFE_CUDA_NUM_THREADS>>>(
- num_kernels, data_im, height, width, ksize, pad, stride, height_col,
+ num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+ pad_w, stride_h, stride_w, height_col,
width_col, data_col);
CUDA_POST_KERNEL_CHECK;
}
@@ -59,40 +64,46 @@ void im2col_gpu(const Dtype* data_im, const int channels,
// Explicit instantiation
template void im2col_gpu<float>(const float* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, float* data_col);
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ float* data_col);
template void im2col_gpu<double>(const double* data_im, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, double* data_col);
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ double* data_col);
template <typename Dtype>
__global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
- const int height, const int width, const int channels, const int ksize,
- const int pad, const int stride, const int height_col, const int width_col,
+ const int height, const int width, const int channels,
+ const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int height_col, const int width_col,
Dtype* data_im) {
CUDA_KERNEL_LOOP(index, n) {
Dtype val = 0;
- int w = index % width + pad;
- int h = (index / width) % height + pad;
+ int w = index % width + pad_w;
+ int h = (index / width) % height + pad_h;
int c = index / (width * height);
// compute the start and end of the output
- int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
- int w_col_end = min(w / stride + 1, width_col);
- int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
- int h_col_end = min(h / stride + 1, height_col);
+ int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+ int w_col_end = min(w / stride_w + 1, width_col);
+ int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+ int h_col_end = min(h / stride_h + 1, height_col);
/*
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
- int c_col = c * ksize * ksize + (h - h_col * stride) * ksize + (w - w_col * stride);
+ int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w
+ + (w - w_col * stride_w);
val += data_col[(c_col * height_col + h_col) * width_col + w_col];
}
}
*/
// equivalent implementation
- int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
- int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
- int coeff_w_col = (1 - stride * height_col * width_col);
+ int offset =
+ (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+ int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+ int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
@@ -104,29 +115,31 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
- const int height, const int width, const int ksize, const int pad,
- const int stride, Dtype* data_im) {
- int height_col = (height + 2 * pad - ksize) / stride + 1;
- int width_col = (width + 2 * pad - ksize) / stride + 1;
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, Dtype* data_im) {
+ int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
// NOLINT_NEXT_LINE(whitespace/operators)
col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
CAFFE_CUDA_NUM_THREADS>>>(
- num_kernels, data_col, height, width, channels, ksize, pad, stride,
+ num_kernels, data_col, height, width, channels, patch_h, patch_w,
+ pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im);
CUDA_POST_KERNEL_CHECK;
}
-
// Explicit instantiation
template void col2im_gpu<float>(const float* data_col, const int channels,
- const int height, const int width, const int psize, const int pad,
- const int stride, float* data_im);
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, float* data_im);
template void col2im_gpu<double>(const double* data_col, const int channels,
- const int height, const int width, const int psize, const int pad,
- const int stride, double* data_im);
-
+ const int height, const int width, const int patch_h, const int patch_w,
+ const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, double* data_im);
} // namespace caffe
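Note: the "equivalent implementation" inside col2im_gpu_kernel folds the commented-out reference loop into a precomputed offset plus per-loop coefficients. Expanding the reference index (c_col * height_col + h_col) * width_col + w_col with c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w) gives exactly offset + h_col * coeff_h_col + w_col * coeff_w_col. A standalone check of that identity, a sketch with dimensions made up for illustration:

    #include <algorithm>
    #include <cassert>

    int main() {
      const int patch_h = 3, patch_w = 5, stride_h = 2, stride_w = 1;
      const int height_col = 7, width_col = 9;
      const int c = 2, h = 6, w = 4;  // padded image coordinates
      const int offset =
          (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
      const int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
      const int coeff_w_col = (1 - stride_w * height_col * width_col);
      // Same loop bounds as the kernel.
      const int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
      const int h_col_end = std::min(h / stride_h + 1, height_col);
      const int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
      const int w_col_end = std::min(w / stride_w + 1, width_col);
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          const int c_col = c * patch_h * patch_w
              + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w);
          const int direct = (c_col * height_col + h_col) * width_col + w_col;
          assert(direct == offset + h_col * coeff_h_col + w_col * coeff_w_col);
        }
      }
      return 0;
    }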