author | Ronghang Hu <huronghang@hotmail.com> | 2015-10-19 11:48:19 -0700
---|---|---
committer | Ronghang Hu <huronghang@hotmail.com> | 2015-10-19 11:48:19 -0700
commit | ee1adfb5019a35493e6c60571058550f7a5c6bfd | |
tree | 4b2c001e7388ce0c2ec94784998be9f2a00ae785 /src/caffe | |
parent | bfdafb0e8ecc6c3f717897a13aa165b5354eb5be | |
parent | 4c93b3dc555891ae0ad75092b6c0f77508740ecf | |
Merge pull request #3185 from mausoomsarkar/master
PReLU speed up
Diffstat (limited to 'src/caffe')
-rw-r--r-- | src/caffe/layers/prelu_layer.cu | 44
1 file changed, 22 insertions, 22 deletions
```diff
diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu
index e1f20048..1225334f 100644
--- a/src/caffe/layers/prelu_layer.cu
+++ b/src/caffe/layers/prelu_layer.cu
@@ -31,10 +31,15 @@ __global__ void PReLUBackward(const int n, const int channels, const int dim,
 
 // CUDA kernel for element-wise parameter backward
 template <typename Dtype>
-__global__ void PReLUParamBackward(const int n, const Dtype* in_diff,
+__global__ void PReLUParamBackward(const int n,
+    const int rows, const int rowPitch, const Dtype* in_diff,
     const Dtype* in_data, Dtype* out_diff) {
   CUDA_KERNEL_LOOP(index, n) {
     out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+    for ( int k = 1; k < rows; k++ ) {
+      out_diff[index] += in_diff[index + k*rowPitch]
+          * in_data[index + k*rowPitch] * (in_data[index + k*rowPitch] <= 0);
+    }
   }
 }
 
@@ -82,29 +87,24 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
     int cdim = channels * dim;
-    Dtype dsum = 0.;
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      // compute element-wise diff
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
-          CAFFE_CUDA_NUM_THREADS>>>(
-          cdim, top_diff + top[0]->offset(n),
-          bottom_data + bottom[0]->offset(n),
-          backward_buff_.mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-      if (channel_shared_) {
-        Dtype d;
-        caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
-            multiplier_.gpu_data(), &d);
-        dsum += d;
-      } else {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
-            backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-            slope_diff);
-      }
-    }
+
+    // compute element-wise diff
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
+        CAFFE_CUDA_NUM_THREADS>>>(
+        cdim, bottom[0]->num(), top[0]->offset(1), top_diff,
+        bottom_data,
+        backward_buff_.mutable_gpu_diff());
+    CUDA_POST_KERNEL_CHECK;
     if (channel_shared_) {
+      Dtype dsum;
+      caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
+          multiplier_.gpu_data(), &dsum);
       caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+    } else {
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
+          backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+          slope_diff);
     }
   }
   // Propagate to bottom
```
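The speedup comes from folding the host-side batch loop into the kernel: instead of launching `PReLUParamBackward` once per sample and reducing after every launch, the kernel now receives the batch size (`rows`) and the per-sample stride (`rowPitch`, i.e. `top[0]->offset(1)`), and each thread accumulates its position's gradient over all samples in a single launch, so the `caffe_gpu_dot`/`caffe_gpu_gemv` reduction also runs once instead of `num()` times. Below is a minimal standalone sketch of the same pattern, not Caffe code: the kernel and variable names are hypothetical, plain `float` stands in for Caffe's `Dtype`, and a grid-stride loop replaces `CUDA_KERNEL_LOOP`.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Batched parameter-gradient kernel: one thread per channel*dim position,
// accumulating over all `rows` samples spaced `rowPitch` elements apart.
__global__ void ParamBackwardBatched(const int n, const int rows,
    const int rowPitch, const float* in_diff, const float* in_data,
    float* out_diff) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n;
       index += blockDim.x * gridDim.x) {
    float sum = 0.f;
    for (int k = 0; k < rows; ++k) {   // the old host-side batch loop, folded in
      const int off = index + k * rowPitch;
      sum += in_diff[off] * in_data[off] * (in_data[off] <= 0);
    }
    out_diff[index] = sum;             // one write instead of `rows` launches
  }
}

int main() {
  const int num = 4, cdim = 8;         // batch size, channels * dim
  const int n = num * cdim;
  float h_diff[n], h_data[n], h_out[cdim];
  for (int i = 0; i < n; ++i) { h_diff[i] = 1.f; h_data[i] = -2.f; }

  float *d_diff, *d_data, *d_out;
  cudaMalloc(&d_diff, n * sizeof(float));
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMalloc(&d_out, cdim * sizeof(float));
  cudaMemcpy(d_diff, h_diff, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice);

  // Single launch over cdim positions; rowPitch == cdim plays the role of
  // top[0]->offset(1) in the Caffe code.
  ParamBackwardBatched<<<1, 128>>>(cdim, num, cdim, d_diff, d_data, d_out);
  cudaMemcpy(h_out, d_out, cdim * sizeof(float), cudaMemcpyDeviceToHost);

  // Each position sums 1 * (-2) * 1 over 4 samples -> -8.
  printf("out[0] = %.1f\n", h_out[0]);

  cudaFree(d_diff); cudaFree(d_data); cudaFree(d_out);
  return 0;
}
```

The trade-off is that each thread now serializes over the batch dimension, which pays off when `channels * dim` alone is enough to saturate the GPU, since it removes `num()` kernel-launch overheads and `num()` host-side reductions per backward pass.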