-rw-r--r--  include/caffe/util/im2col.hpp          20
-rw-r--r--  include/caffe/vision_layers.hpp        12
-rw-r--r--  src/caffe/layers/conv_layer.cpp        58
-rw-r--r--  src/caffe/layers/conv_layer.cu         11
-rw-r--r--  src/caffe/layers/im2col_layer.cpp      51
-rw-r--r--  src/caffe/layers/im2col_layer.cu        6
-rw-r--r--  src/caffe/test/test_im2col_kernel.cu   17
-rw-r--r--  src/caffe/util/im2col.cpp              62
-rw-r--r--  src/caffe/util/im2col.cu               91
9 files changed, 216 insertions, 112 deletions
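
Note: every file below uses the same per-axis output geometry, now computed independently for height and width: out = (in + 2 * pad - kernel) / stride + 1. A minimal standalone sketch of that arithmetic with hypothetical dimensions (illustration only, not part of the patch):

    #include <cstdio>

    // Output extent along one axis: the same formula the patch uses in
    // ConvolutionLayer::SetUp and Im2colLayer::SetUp.
    int out_extent(int in, int kernel, int pad, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // A rectangular 3x5 kernel on a 28x28 input, pad 1x2, stride 2x1.
      int height_out = out_extent(28, 3, 1, 2);  // (28 + 2 - 3) / 2 + 1 = 14
      int width_out  = out_extent(28, 5, 2, 1);  // (28 + 4 - 5) / 1 + 1 = 28
      printf("%d x %d\n", height_out, width_out);
      return 0;
    }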
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index a649d8cc..c263997a 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -7,23 +7,27 @@ namespace caffe {
 
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im);
 
 }  // namespace caffe

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 05c7b45e..80dc3754 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -45,11 +45,11 @@ class ConvolutionLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
 
-  int kernel_size_;
-  int stride_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
   int num_;
   int channels_;
-  int pad_;
+  int pad_h_, pad_w_;
   int height_;
   int width_;
   int num_output_;
@@ -119,12 +119,12 @@ class Im2colLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
 
-  int kernel_size_;
-  int stride_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
   int channels_;
   int height_;
   int width_;
-  int pad_;
+  int pad_h_, pad_w_;
 };
 
 /* InnerProductLayer
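
Note: every old square-parameter call site maps onto the widened signatures above by repeating each scalar for both axes. A hypothetical convenience wrapper, not part of the patch, makes the correspondence explicit:

    #include "caffe/util/im2col.hpp"

    // Hypothetical wrapper (illustration only): forwards the old
    // square-parameter call onto the new rectangular signature by
    // passing each scalar for both the height and width axes.
    void im2col_cpu_square(const float* data_im, const int channels,
        const int height, const int width, const int ksize, const int pad,
        const int stride, float* data_col) {
      caffe::im2col_cpu(data_im, channels, height, width, ksize, ksize,
          pad, pad, stride, stride, data_col);
    }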
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 11f29e78..79b040f0 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -14,10 +14,42 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   Layer<Dtype>::SetUp(bottom, top);
-  kernel_size_ = this->layer_param_.convolution_param().kernel_size();
-  stride_ = this->layer_param_.convolution_param().stride();
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(!conv_param.has_kernel_size() !=
+      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(conv_param.has_kernel_size() ||
+      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+      && conv_param.has_pad_w())
+      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+      && conv_param.has_stride_w())
+      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
   group_ = this->layer_param_.convolution_param().group();
-  pad_ = this->layer_param_.convolution_param().pad();
   num_ = bottom[0]->num();
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
@@ -37,17 +69,18 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_EQ(channels_ % group_, 0);
   // The im2col result buffer would only hold one image at a time to avoid
   // overly large memory usage.
-  int height_out = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
-  int width_out = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+  int height_out =
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  int width_out = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
   col_buffer_.Reshape(
-      1, channels_ * kernel_size_ * kernel_size_, height_out, width_out);
+      1, channels_ * kernel_h_ * kernel_w_, height_out, width_out);
   // Set the parameters
   CHECK_EQ(num_output_ % group_, 0)
       << "Number of output should be multiples of group.";
   bias_term_ = this->layer_param_.convolution_param().bias_term();
   // Figure out the dimensions for individual gemms.
   M_ = num_output_ / group_;
-  K_ = channels_ * kernel_size_ * kernel_size_ / group_;
+  K_ = channels_ * kernel_h_ * kernel_w_ / group_;
   N_ = height_out * width_out;
   for (int top_id = 0; top_id < top->size(); ++top_id) {
     (*top)[top_id]->Reshape(num_, num_output_, height_out, width_out);
@@ -63,7 +96,7 @@ void ConvolutionLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   }
   // Intialize the weight
   this->blobs_[0].reset(new Blob<Dtype>(
-      num_output_, channels_ / group_, kernel_size_, kernel_size_));
+      num_output_, channels_ / group_, kernel_h_, kernel_w_));
   // fill the weights
   shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
       this->layer_param_.convolution_param().weight_filler()));
@@ -99,7 +132,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   for (int n = 0; n < num_; ++n) {
     // First, im2col
     im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, col_data);
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, col_data);
     // Second, innerproduct with groups
     for (int g = 0; g < group_; ++g) {
       caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -160,7 +194,8 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       // Since we saved memory in the forward pass by not storing all col
       // data, we will need to recompute them.
       im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
-          width_, kernel_size_, pad_, stride_, col_data);
+          width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, col_data);
       // gradient w.r.t. weight. Note that we will accumulate diffs.
       if (this->param_propagate_down_[0]) {
         for (int g = 0; g < group_; ++g) {
@@ -179,8 +214,8 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
             (Dtype)0., col_diff + col_offset * g);
       }
       // col2im back to the data
-      col2im_cpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
-          stride_, bottom_diff + (*bottom)[i]->offset(n));
+      col2im_cpu(col_diff, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
     }
   }
 }
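
Note: the first CHECK in SetUp uses !a != !b as a boolean XOR, so exactly one of kernel_size or the (kernel_h, kernel_w) pair may be set. A standalone sketch of the same predicate (hypothetical helper, for illustration):

    #include <cstdio>

    // Mirrors the patch's CHECK: exactly one of {kernel_size} or
    // {kernel_h AND kernel_w} must be specified.
    bool kernel_spec_ok(bool has_size, bool has_h, bool has_w) {
      return !has_size != !(has_h && has_w);
    }

    int main() {
      printf("%d\n", kernel_spec_ok(true, false, false));  // 1: square form
      printf("%d\n", kernel_spec_ok(false, true, true));   // 1: rectangular form
      printf("%d\n", kernel_spec_ok(true, true, true));    // 0: both given
      printf("%d\n", kernel_spec_ok(false, true, false));  // 0: kernel_w missing
      return 0;
    }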
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index d328fbde..86c047b4 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -24,7 +24,8 @@ Dtype ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   for (int n = 0; n < num_; ++n) {
     // First, im2col
     im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, col_data);
+        width_, kernel_h_, kernel_w_, pad_h_,
+        pad_w_, stride_h_, stride_w_, col_data);
     // Second, innerproduct with groups
     for (int g = 0; g < group_; ++g) {
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
@@ -85,7 +87,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       // Since we saved memory in the forward pass by not storing all col
       // data, we will need to recompute them.
       im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
-          width_, kernel_size_, pad_, stride_, col_data);
+          width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, col_data);
       // gradient w.r.t. weight. Note that we will accumulate diffs.
       if (this->param_propagate_down_[0]) {
         for (int g = 0; g < group_; ++g) {
@@ -104,8 +107,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
             (Dtype)0., col_diff + col_offset * g);
       }
       // col2im back to the data
-      col2im_gpu(col_diff, channels_, height_, width_, kernel_size_, pad_,
-          stride_, bottom_diff + (*bottom)[i]->offset(n));
+      col2im_gpu(col_diff, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+          stride_h_, stride_w_, bottom_diff + (*bottom)[i]->offset(n));
     }
   }
 }
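
Note: the gemm shapes used by both the CPU and GPU paths carry over unchanged, except that K_ now counts kernel_h * kernel_w column entries instead of kernel_size squared. A worked sketch with a hypothetical layer configuration (illustration only):

    #include <cstdio>

    int main() {
      // Hypothetical layer: 96 outputs, 3 input channels, 11x7 kernel,
      // stride 4x4, no pad, 227x227 input, group 1.
      int num_output = 96, channels = 3, group = 1;
      int kernel_h = 11, kernel_w = 7, stride_h = 4, stride_w = 4;
      int height = 227, width = 227, pad_h = 0, pad_w = 0;

      int height_out = (height + 2 * pad_h - kernel_h) / stride_h + 1;  // 55
      int width_out  = (width + 2 * pad_w - kernel_w) / stride_w + 1;   // 56
      int M = num_output / group;                      // weight gemm rows
      int K = channels * kernel_h * kernel_w / group;  // 3 * 11 * 7 = 231
      int N = height_out * width_out;                  // output pixels
      printf("M=%d K=%d N=%d\n", M, K, N);
      return 0;
    }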
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 2c86412b..fabce0a6 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -13,15 +13,48 @@ template <typename Dtype>
 void Im2colLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
       vector<Blob<Dtype>*>* top) {
   Layer<Dtype>::SetUp(bottom, top);
-  kernel_size_ = this->layer_param_.convolution_param().kernel_size();
-  stride_ = this->layer_param_.convolution_param().stride();
-  pad_ = this->layer_param_.convolution_param().pad();
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(!conv_param.has_kernel_size() !=
+      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(conv_param.has_kernel_size() ||
+      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+      && conv_param.has_pad_w())
+      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+      && conv_param.has_stride_w())
+      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
   channels_ = bottom[0]->channels();
   height_ = bottom[0]->height();
   width_ = bottom[0]->width();
-  (*top)[0]->Reshape(bottom[0]->num(), channels_ * kernel_size_ * kernel_size_,
-      (height_ + 2 * pad_ - kernel_size_) / stride_ + 1,
-      (width_ + 2 * pad_ - kernel_size_) / stride_ + 1);
+  (*top)[0]->Reshape(
+      bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+      (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
 }
 
 template <typename Dtype>
@@ -31,7 +64,8 @@ Dtype Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = (*top)[0]->mutable_cpu_data();
   for (int n = 0; n < bottom[0]->num(); ++n) {
     im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
   }
   return Dtype(0.);
 }
@@ -43,7 +77,8 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
     col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
   }
 }

diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu
index 9cfb74e8..84439742 100644
--- a/src/caffe/layers/im2col_layer.cu
+++ b/src/caffe/layers/im2col_layer.cu
@@ -16,7 +16,8 @@ Dtype Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = (*top)[0]->mutable_gpu_data();
   for (int n = 0; n < bottom[0]->num(); ++n) {
     im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_size_, pad_, stride_, top_data + (*top)[0]->offset(n));
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data + (*top)[0]->offset(n));
   }
   return Dtype(0.);
 }
@@ -28,7 +29,8 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
     col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff + (*bottom)[0]->offset(n));
   }
 }
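
Note: the Im2colLayer top blob grows to channels * kernel_h * kernel_w channels, so a rectangular kernel changes the buffer footprint through the product of the two extents. A quick sketch of the resulting shape and element count with hypothetical dimensions (illustration only):

    #include <cstdio>

    int main() {
      // Hypothetical bottom blob: num=2, channels=3, 16x20 image;
      // 3x5 kernel, pad 0, stride 1x1.
      int num = 2, channels = 3, height = 16, width = 20;
      int kernel_h = 3, kernel_w = 5, pad_h = 0, pad_w = 0;
      int stride_h = 1, stride_w = 1;

      int channels_col = channels * kernel_h * kernel_w;                // 45
      int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;  // 14
      int width_col  = (width + 2 * pad_w - kernel_w) / stride_w + 1;   // 16
      // Shape mirrors the (*top)[0]->Reshape(...) call in the diff above.
      printf("top: %d x %d x %d x %d (%d elements)\n",
             num, channels_col, height_col, width_col,
             num * channels_col * height_col * width_col);
      return 0;
    }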
diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu
index 5671968b..37d1a152 100644
--- a/src/caffe/test/test_im2col_kernel.cu
+++ b/src/caffe/test/test_im2col_kernel.cu
@@ -17,8 +17,10 @@ namespace caffe {
 // Forward declare kernel functions
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
     Dtype* data_col);
 
 extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
@@ -87,8 +89,10 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
   // CPU Version
   for (int n = 0; n < this->blob_bottom_->num(); ++n) {
     im2col_cpu(this->blob_bottom_->cpu_data() + this->blob_bottom_->offset(n),
-        this->channels_, this->height_, this->width_, this->kernel_size_,
-        this->pad_, this->stride_, cpu_data + this->blob_top_cpu_->offset(n));
+        this->channels_, this->height_, this->width_,
+        this->kernel_size_, this->kernel_size_, this->pad_, this->pad_,
+        this->stride_, this->stride_,
+        cpu_data + this->blob_top_cpu_->offset(n));
   }
 
   // GPU version
@@ -102,8 +106,9 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
       // NOLINT_NEXT_LINE(whitespace/operators)
       im2col_gpu_kernel<TypeParam><<<grid_dim, CAFFE_CUDA_NUM_THREADS>>>(
           num_kernels, bottom_data + this->blob_bottom_->offset(n),
-          this->height_, this->width_, this->kernel_size_, this->pad_,
-          this->stride_, this->height_col_, this->width_col_,
+          this->height_, this->width_, this->kernel_size_, this->kernel_size_,
+          this->pad_, this->pad_, this->stride_, this->stride_,
+          this->height_col_, this->width_col_,
           top_data + this->blob_top_->offset(n));
       CUDA_POST_KERNEL_CHECK;
     }
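
Note: the updated test still drives both paths with square parameters (kernel_size_ is simply passed for both axes). A rectangular case can be spot-checked directly against im2col_cpu; a self-contained sketch with hypothetical toy data (not part of the test suite):

    #include <cstdio>
    #include <vector>
    #include "caffe/util/im2col.hpp"

    int main() {
      using caffe::im2col_cpu;
      // 1 channel, 3x4 image, 2x3 kernel, no pad, stride 1: the output
      // grid is 2x2 and there are 1 * 2 * 3 = 6 column channels.
      const int channels = 1, height = 3, width = 4;
      const int kernel_h = 2, kernel_w = 3;
      std::vector<float> im(channels * height * width);
      for (int i = 0; i < static_cast<int>(im.size()); ++i) im[i] = i;
      const int height_col = height - kernel_h + 1;  // 2
      const int width_col = width - kernel_w + 1;    // 2
      std::vector<float> col(channels * kernel_h * kernel_w
                             * height_col * width_col);
      im2col_cpu(im.data(), channels, height, width, kernel_h, kernel_w,
                 0, 0, 1, 1, col.data());
      // The first column channel holds the top-left element of each window.
      printf("%g %g %g %g\n", col[0], col[1], col[2], col[3]);  // 0 1 4 5
      return 0;
    }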
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index ce4e1884..ca1dc797 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -11,19 +11,21 @@ namespace caffe {
 
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col) {
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
-  int channels_col = channels * ksize * ksize;
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col) {
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int channels_col = channels * kernel_h * kernel_w;
   for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % ksize;
-    int h_offset = (c / ksize) % ksize;
-    int c_im = c / ksize / ksize;
+    int w_offset = c % kernel_w;
+    int h_offset = (c / kernel_w) % kernel_h;
+    int c_im = c / kernel_h / kernel_w;
     for (int h = 0; h < height_col; ++h) {
       for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride - pad + h_offset;
-        int w_pad = w * stride - pad + w_offset;
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
           data_col[(c * height_col + h) * width_col + w] =
             data_im[(c_im * height + h_pad) * width + w_pad];
@@ -36,28 +38,32 @@ void im2col_cpu(const Dtype* data_im, const int channels,
 
 // Explicit instantiation
 template void im2col_cpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_col);
 template void im2col_cpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_col);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im) {
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_im) {
   caffe_set(height * width * channels, Dtype(0), data_im);
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
-  int channels_col = channels * ksize * ksize;
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int channels_col = channels * patch_h * patch_w;
   for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % ksize;
-    int h_offset = (c / ksize) % ksize;
-    int c_im = c / ksize / ksize;
+    int w_offset = c % patch_w;
+    int h_offset = (c / patch_w) % patch_h;
+    int c_im = c / patch_h / patch_w;
     for (int h = 0; h < height_col; ++h) {
       for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride - pad + h_offset;
-        int w_pad = w * stride - pad + w_offset;
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
           data_im[(c_im * height + h_pad) * width + w_pad] +=
               data_col[(c * height_col + h) * width_col + w];
@@ -68,10 +74,12 @@ void col2im_cpu(const Dtype* data_col, const int channels,
 
 // Explicit instantiation
 template void col2im_cpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, float* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, double* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_im);
 
 }  // namespace caffe
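
Note: the column channel index is decomposed with the width axis innermost, c = (c_im * kernel_h + h_offset) * kernel_w + w_offset, which is why w_offset is c % kernel_w and h_offset is (c / kernel_w) % kernel_h. A round-trip check of that decomposition (standalone sketch, hypothetical dimensions):

    #include <cassert>
    #include <cstdio>

    int main() {
      const int channels = 3, kernel_h = 2, kernel_w = 5;  // rectangular on purpose
      for (int c = 0; c < channels * kernel_h * kernel_w; ++c) {
        int w_offset = c % kernel_w;
        int h_offset = (c / kernel_w) % kernel_h;
        int c_im = c / kernel_h / kernel_w;
        // Recompose: w_offset varies fastest, then h_offset, then c_im.
        assert(c == (c_im * kernel_h + h_offset) * kernel_w + w_offset);
      }
      printf("decomposition round-trips\n");
      return 0;
    }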
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index 79faa6cb..b565d2d3 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -12,23 +12,25 @@ namespace caffe {
 
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
    Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
     int w_out = index % width_col;
     int h_index = index / width_col;
     int h_out = h_index % height_col;
     int channel_in = h_index / height_col;
-    int channel_out = channel_in * ksize * ksize;
-    int h_in = h_out * stride - pad;
-    int w_in = w_out * stride - pad;
+    int channel_out = channel_in * kernel_h * kernel_w;
+    int h_in = h_out * stride_h - pad_h;
+    int w_in = w_out * stride_w - pad_w;
     Dtype* data_col_ptr = data_col;
     data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
     const Dtype* data_im_ptr = data_im;
     data_im_ptr += (channel_in * height + h_in) * width + w_in;
-    for (int i = 0; i < ksize; ++i) {
-      for (int j = 0; j < ksize; ++j) {
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
         int h = h_in + i;
         int w = w_in + j;
         *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
@@ -41,17 +43,20 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col) {
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col) {
   // We are going to launch channels * height_col * width_col kernels, each
   // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
   int num_kernels = channels * height_col * width_col;
   // NOLINT_NEXT_LINE(whitespace/operators)
   im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_im, height, width, ksize, pad, stride, height_col,
+      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+      pad_w, stride_h, stride_w, height_col,
       width_col, data_col);
   CUDA_POST_KERNEL_CHECK;
 }
@@ -59,40 +64,46 @@ void im2col_gpu(const Dtype* data_im, const int channels,
 
 // Explicit instantiation
 template void im2col_gpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_col);
 template void im2col_gpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_col);
 
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
-    const int height, const int width, const int channels, const int ksize,
-    const int pad, const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
     Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
     Dtype val = 0;
-    int w = index % width + pad;
-    int h = (index / width) % height + pad;
+    int w = index % width + pad_w;
+    int h = (index / width) % height + pad_h;
     int c = index / (width * height);
     // compute the start and end of the output
-    int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-    int w_col_end = min(w / stride + 1, width_col);
-    int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-    int h_col_end = min(h / stride + 1, height_col);
+    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+    int w_col_end = min(w / stride_w + 1, width_col);
+    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+    int h_col_end = min(h / stride_h + 1, height_col);
    /*
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        // the col location: [c * width * height + h_out, w_out]
-        int c_col = c * ksize * ksize + (h - h_col * stride) * ksize + (w - w_col * stride);
+        int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w +
+            (w - w_col * stride_w);
        val += data_col[(c_col * height_col + h_col) * width_col + w_col];
      }
    }
    */
    // equivalent implementation
-    int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-    int coeff_w_col = (1 - stride * height_col * width_col);
+    int offset =
+        (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+    int coeff_w_col = (1 - stride_w * height_col * width_col);
     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
         val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
@@ -104,29 +115,31 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im) {
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im) {
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
   int num_kernels = channels * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
   // NOLINT_NEXT_LINE(whitespace/operators)
   col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_col, height, width, channels, ksize, pad, stride,
+      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      pad_h, pad_w, stride_h, stride_w,
       height_col, width_col, data_im);
   CUDA_POST_KERNEL_CHECK;
 }
-
 // Explicit instantiation
 template void col2im_gpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, float* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, double* data_im);
-
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_im);
 }  // namespace caffe
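
Note: the "equivalent implementation" in col2im_gpu_kernel folds the naive column index into a base offset plus per-step coefficients. Expanding (c_col * height_col + h_col) * width_col + w_col with c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w) yields exactly offset + h_col * coeff_h_col + w_col * coeff_w_col. A brute-force check of that identity (standalone sketch, arbitrary hypothetical geometry):

    #include <cassert>
    #include <cstdio>

    int main() {
      // Any positive geometry satisfies the identity: it is pure index
      // algebra, independent of the h_col/w_col range clamping.
      const int patch_h = 3, patch_w = 5, stride_h = 2, stride_w = 1;
      const int height_col = 7, width_col = 9;
      const int c = 2, h = 6, w = 4;  // padded image coordinates

      int offset = (c * patch_h * patch_w + h * patch_w + w)
          * height_col * width_col;
      int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
      int coeff_w_col = (1 - stride_w * height_col * width_col);

      for (int h_col = 0; h_col < height_col; ++h_col) {
        for (int w_col = 0; w_col < width_col; ++w_col) {
          int c_col = c * patch_h * patch_w
              + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w);
          int naive = (c_col * height_col + h_col) * width_col + w_col;
          assert(naive == offset + h_col * coeff_h_col + w_col * coeff_w_col);
        }
      }
      printf("closed form matches naive index\n");
      return 0;
    }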