path: root/src/caffe/layers/conv_layer.cu
author    Jonathan L Long <jonlong@cs.berkeley.edu>  2014-12-21 19:42:29 -0800
committer Jonathan L Long <jonlong@cs.berkeley.edu>  2015-01-11 00:28:44 -0800
commit    e3e2f2d3139880f77355e6837e72ad6c2848b448 (patch)
tree      0430b5a351a3f81f85912872392f1ceef89c1ac5 /src/caffe/layers/conv_layer.cu
parent    a0e9db1347c325ff007166e79d1ca693e2e5de18 (diff)
download  caffeonacl-e3e2f2d3139880f77355e6837e72ad6c2848b448.tar.gz / .tar.bz2 / .zip
rewrite ConvolutionLayer to use BaseConvolutionLayer helpers
Diffstat (limited to 'src/caffe/layers/conv_layer.cu')
-rw-r--r--  src/caffe/layers/conv_layer.cu  |  117
1 file changed, 23 insertions(+), 94 deletions(-)
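
The commit replaces ConvolutionLayer's inline im2col + grouped GEMM plumbing with calls to the BaseConvolutionLayer helpers (forward_gpu_gemm, forward_gpu_bias, weight_gpu_gemm, backward_gpu_gemm, backward_gpu_bias). As a rough sketch of what the forward-pass helpers do, reconstructed from the removed inline code in the diff below; member names such as col_buffer_, M_, K_, N_, and group_ are carried over from the old implementation and are assumptions about the helper internals, not the actual BaseConvolutionLayer source:

// Sketch only: reconstructed from the removed ConvolutionLayer code below.
// The real BaseConvolutionLayer helpers may differ in naming and in how the
// column buffer is managed.
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
    const Dtype* weights, Dtype* output) {
  const Dtype* col_buff = input;  // 1x1 convolution: use the input directly
  if (!is_1x1_) {
    // Unroll input patches into a column matrix for the GEMM.
    im2col_gpu(input, channels_, height_, width_, kernel_h_, kernel_w_,
        pad_h_, pad_w_, stride_h_, stride_w_, col_buffer_.mutable_gpu_data());
    col_buff = col_buffer_.gpu_data();
  }
  // One (M_ x N_) = (M_ x K_) * (K_ x N_) product per group.
  for (int g = 0; g < group_; ++g) {
    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
        (Dtype)1., weights + M_ * K_ * g, col_buff + K_ * N_ * g,
        (Dtype)0., output + M_ * N_ * g);
  }
}

template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
    const Dtype* bias) {
  // Broadcast each output channel's bias across the N_ spatial positions.
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_, N_, 1,
      (Dtype)1., bias, bias_multiplier_.gpu_data(), (Dtype)1., output);
}

With the plumbing factored into the base class, ConvolutionLayer::Forward_gpu shrinks to a per-image loop over these two calls, and the same helpers can presumably be shared by other convolution-style layers.
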
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index af14facb..3902fdf3 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -8,135 +8,64 @@
namespace caffe {
-/// @brief refer to CPU forward -- the BLAS implementation is the same.
template <typename Dtype>
void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
+ const Dtype* weight = this->blobs_[0]->gpu_data();
for (int i = 0; i < bottom.size(); ++i) {
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* top_data = top[i]->mutable_gpu_data();
- Dtype* col_buff = NULL;
- if (!is_1x1_) {
- col_buff = col_buffer_.mutable_gpu_data();
- }
- const Dtype* weight = this->blobs_[0]->gpu_data();
- int weight_offset = M_ * K_;
- int col_offset = K_ * N_;
- int top_offset = M_ * N_;
- for (int n = 0; n < num_; ++n) {
- // im2col transformation: unroll input regions for filtering
- // into column matrix for multplication.
- if (!is_1x1_) {
- im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
- width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
- col_buff);
- } else {
- col_buff = bottom[i]->mutable_gpu_data() + bottom[i]->offset(n);
- }
- // Take inner products for groups.
- for (int g = 0; g < group_; ++g) {
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
- (Dtype)1., weight + weight_offset * g, col_buff + col_offset * g,
- (Dtype)0., top_data + top[i]->offset(n) + top_offset * g);
- }
- // Add bias.
- if (bias_term_) {
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
- N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(),
- bias_multiplier_.gpu_data(),
- (Dtype)1., top_data + top[i]->offset(n));
+ for (int n = 0; n < this->num_; ++n) {
+ this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
+ top_data + top[i]->offset(n));
+ if (this->bias_term_) {
+ const Dtype* bias = this->blobs_[1]->gpu_data();
+ this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
}
}
}
}
-/// @brief refer to CPU backward -- the BLAS implementation is the same.
template <typename Dtype>
void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
- const Dtype* weight = NULL;
- Dtype* weight_diff = NULL;
+ const Dtype* weight = this->blobs_[0]->gpu_data();
+ Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
if (this->param_propagate_down_[0]) {
- weight = this->blobs_[0]->gpu_data();
- weight_diff = this->blobs_[0]->mutable_gpu_diff();
caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
}
- Dtype* bias_diff = NULL;
- if (bias_term_ && this->param_propagate_down_[1]) {
- bias_diff = this->blobs_[1]->mutable_gpu_diff();
- caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
+ if (this->bias_term_ && this->param_propagate_down_[1]) {
+ caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
+ this->blobs_[1]->mutable_gpu_diff());
}
- const int weight_offset = M_ * K_;
- const int col_offset = K_ * N_;
- const int top_offset = M_ * N_;
for (int i = 0; i < top.size(); ++i) {
- const Dtype* top_diff = NULL;
+ const Dtype* top_diff = top[i]->gpu_diff();
// Bias gradient, if necessary.
- if (bias_term_ && this->param_propagate_down_[1]) {
- top_diff = top[i]->gpu_diff();
- for (int n = 0; n < num_; ++n) {
- caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
- 1., top_diff + top[0]->offset(n),
- bias_multiplier_.gpu_data(), 1.,
- bias_diff);
+ if (this->bias_term_ && this->param_propagate_down_[1]) {
+ Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+ for (int n = 0; n < this->num_; ++n) {
+ this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
}
}
if (this->param_propagate_down_[0] || propagate_down[i]) {
- if (!top_diff) {
- top_diff = top[i]->gpu_diff();
- }
- Dtype* col_buff = NULL;
- if (!is_1x1_) {
- col_buff = col_buffer_.mutable_gpu_data();
- }
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
- for (int n = 0; n < num_; ++n) {
- // Since we saved memory in the forward pass by not storing all col
- // data, we will need to recompute them.
- if (!is_1x1_) {
- im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
- width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
- stride_h_, stride_w_, col_buff);
- } else {
- col_buff = bottom[i]->mutable_gpu_data() + bottom[i]->offset(n);
- }
+ for (int n = 0; n < this->num_; ++n) {
// gradient w.r.t. weight. Note that we will accumulate diffs.
if (this->param_propagate_down_[0]) {
- for (int g = 0; g < group_; ++g) {
- caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
- (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
- col_buff + col_offset * g, (Dtype)1.,
- weight_diff + weight_offset * g);
- }
+ this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n),
+ top_diff + top[i]->offset(n), weight_diff);
}
- // gradient w.r.t. bottom data, if necessary
+ // gradient w.r.t. bottom data, if necessary.
if (propagate_down[i]) {
- if (weight == NULL) {
- weight = this->blobs_[0]->gpu_data();
- }
- if (is_1x1_) {
- col_buff = bottom[i]->mutable_gpu_diff() + bottom[i]->offset(n);
- }
- for (int g = 0; g < group_; ++g) {
- caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
- (Dtype)1., weight + weight_offset * g,
- top_diff + top[i]->offset(n) + top_offset * g,
- (Dtype)0., col_buff + col_offset * g);
- }
- // col2im back to the data
- if (!is_1x1_) {
- col2im_gpu(col_buff, channels_, height_, width_,
- kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
- bottom_diff + bottom[i]->offset(n));
- }
+ this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight,
+ bottom_diff + bottom[i]->offset(n));
}
}
}
}
}
-
INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer);
} // namespace caffe
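
For reference, a comparable sketch of the backward helpers called in the new Backward_gpu, again reconstructed from the removed inline code above rather than from BaseConvolutionLayer itself; member names and buffer handling are assumptions carried over from the old implementation:

// Sketch only: reconstructed from the removed backward code above; the real
// BaseConvolutionLayer helpers may differ in details.
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
    const Dtype* input) {
  // Accumulate d(loss)/d(bias): sum the top diff over spatial positions.
  caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_, (Dtype)1., input,
      bias_multiplier_.gpu_data(), (Dtype)1., bias);
}

template <typename Dtype>
void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
    const Dtype* output, Dtype* weights) {
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    // The column data was not kept after forward, so recompute it.
    im2col_gpu(input, channels_, height_, width_, kernel_h_, kernel_w_,
        pad_h_, pad_w_, stride_h_, stride_w_, col_buffer_.mutable_gpu_data());
    col_buff = col_buffer_.gpu_data();
  }
  // Accumulate d(loss)/d(weight) = top_diff * col_buff^T, per group.
  for (int g = 0; g < group_; ++g) {
    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
        (Dtype)1., output + M_ * N_ * g, col_buff + K_ * N_ * g,
        (Dtype)1., weights + M_ * K_ * g);
  }
}

template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
    const Dtype* weights, Dtype* input) {
  Dtype* col_buff = col_buffer_.mutable_gpu_data();
  if (is_1x1_) {
    col_buff = input;  // no unrolling needed; write the diff in place
  }
  // d(loss)/d(col_buff) = weight^T * top_diff, per group.
  for (int g = 0; g < group_; ++g) {
    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
        (Dtype)1., weights + M_ * K_ * g, output + M_ * N_ * g,
        (Dtype)0., col_buff + K_ * N_ * g);
  }
  if (!is_1x1_) {
    // Fold the column diff back into the image layout of bottom_diff.
    col2im_gpu(col_buff, channels_, height_, width_, kernel_h_, kernel_w_,
        pad_h_, pad_w_, stride_h_, stride_w_, input);
  }
}

As in the old code, the column buffer is recomputed during backward rather than cached from forward, trading a repeated im2col for lower memory use.
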