author    Ronghang Hu <huronghang@hotmail.com>  2015-10-19 11:48:19 -0700
committer Ronghang Hu <huronghang@hotmail.com>  2015-10-19 11:48:19 -0700
commit    ee1adfb5019a35493e6c60571058550f7a5c6bfd (patch)
tree      4b2c001e7388ce0c2ec94784998be9f2a00ae785 /src/caffe
parent    bfdafb0e8ecc6c3f717897a13aa165b5354eb5be (diff)
parent    4c93b3dc555891ae0ad75092b6c0f77508740ecf (diff)
Merge pull request #3185 from mausoomsarkar/master
PReLU speed up
Diffstat (limited to 'src/caffe')
-rw-r--r--  src/caffe/layers/prelu_layer.cu | 44
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu
index e1f20048..1225334f 100644
--- a/src/caffe/layers/prelu_layer.cu
+++ b/src/caffe/layers/prelu_layer.cu
@@ -31,10 +31,15 @@ __global__ void PReLUBackward(const int n, const int channels, const int dim,
// CUDA kernel for element-wise parameter backward
template <typename Dtype>
-__global__ void PReLUParamBackward(const int n, const Dtype* in_diff,
+__global__ void PReLUParamBackward(const int n,
+ const int rows, const int rowPitch, const Dtype* in_diff,
const Dtype* in_data, Dtype* out_diff) {
CUDA_KERNEL_LOOP(index, n) {
out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+ for (int k = 1; k < rows; k++) {
+ out_diff[index] += in_diff[index + k*rowPitch]
+ * in_data[index + k*rowPitch] * (in_data[index + k*rowPitch] <= 0);
+ }
}
}
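
The hunk above folds the old per-image host loop into the kernel itself: each thread now owns one of the cdim = channels * dim positions and accumulates the slope gradient across the whole batch, stepping by rowPitch (the offset between consecutive images in the blob). A minimal standalone sketch of the same idea, with illustrative names (param_backward_sketch is not the committed code), written with the plain grid-stride loop that the CUDA_KERNEL_LOOP macro expands to:

// Sketch only: one thread per position, accumulating over the batch in
// registers, so a single kernel launch replaces num separate launches.
template <typename Dtype>
__global__ void param_backward_sketch(const int cdim, const int rows,
    const int row_pitch, const Dtype* in_diff, const Dtype* in_data,
    Dtype* out_diff) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < cdim;
       index += blockDim.x * gridDim.x) {  // grid-stride loop over positions
    Dtype sum = 0;
    for (int k = 0; k < rows; ++k) {  // accumulate across the batch
      const int idx = index + k * row_pitch;
      // The PReLU slope gradient only flows where the input was non-positive.
      sum += in_diff[idx] * in_data[idx] * (in_data[idx] <= 0);
    }
    out_diff[index] = sum;
  }
}

Writing the running sum to out_diff once per position also avoids the repeated global-memory read-modify-write that the committed kernel's += in a loop implies; both variants compute the same values.
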
@@ -82,29 +87,24 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
if (this->param_propagate_down_[0]) {
Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
int cdim = channels * dim;
- Dtype dsum = 0.;
- for (int n = 0; n < bottom[0]->num(); ++n) {
- // compute element-wise diff
- // NOLINT_NEXT_LINE(whitespace/operators)
- PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
- CAFFE_CUDA_NUM_THREADS>>>(
- cdim, top_diff + top[0]->offset(n),
- bottom_data + bottom[0]->offset(n),
- backward_buff_.mutable_gpu_diff());
- CUDA_POST_KERNEL_CHECK;
- if (channel_shared_) {
- Dtype d;
- caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
- multiplier_.gpu_data(), &d);
- dsum += d;
- } else {
- caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
- backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
- slope_diff);
- }
- }
+
+ // compute element-wise diff
+ // NOLINT_NEXT_LINE(whitespace/operators)
+ PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
+ CAFFE_CUDA_NUM_THREADS>>>(
+ cdim, bottom[0]->num(), top[0]->offset(1), top_diff,
+ bottom_data,
+ backward_buff_.mutable_gpu_diff());
+ CUDA_POST_KERNEL_CHECK;
if (channel_shared_) {
+ Dtype dsum;
+ caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
+ multiplier_.gpu_data(), &dsum);
caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+ } else {
+ caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
+ backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+ slope_diff);
}
}
// Propagate to bottom
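
After this hunk the host side issues one kernel launch over cdim positions followed by a single reduction, instead of a launch-plus-reduction per image. A hedged sketch of that launch pattern, reusing param_backward_sketch from above and raw cuBLAS in place of Caffe's caffe_gpu_dot / caffe_gpu_gemv wrappers (backward_param_sketch and the handle plumbing are illustrative, not Caffe API):

#include <cublas_v2.h>

// Sketch only: mirrors the structure of PReLULayer::Backward_gpu after the
// change. The committed code calls PReLUParamBackward and caffe_gpu_* BLAS
// wrappers; multiplier_ plays the role of `ones` here.
void backward_param_sketch(cublasHandle_t handle, int num, int channels,
    int dim, const float* top_diff, const float* bottom_data,
    float* buff /* cdim scratch */, const float* ones /* cdim ones */,
    float* slope_diff, bool channel_shared) {
  const int cdim = channels * dim;
  const int threads = 512;
  const int blocks = (cdim + threads - 1) / threads;
  // One launch covers the whole batch; the row pitch is one image's size.
  param_backward_sketch<float><<<blocks, threads>>>(
      cdim, num, cdim, top_diff, bottom_data, buff);
  if (channel_shared) {
    // Single shared slope: reduce all cdim partial sums with one dot product.
    float dsum = 0.f;
    cublasSdot(handle, cdim, buff, 1, ones, 1, &dsum);
    // ...then add dsum into the one-element slope gradient
    // (caffe_gpu_add_scalar in the committed code).
  } else {
    // Per-channel slopes: buff is a row-major channels x dim matrix, which
    // column-major cuBLAS sees as dim x channels, so op = T times a vector
    // of ones sums out the spatial dimension for each channel.
    const float alpha = 1.f, beta = 1.f;
    cublasSgemv(handle, CUBLAS_OP_T, dim, channels, &alpha, buff, dim,
                ones, 1, &beta, slope_diff, 1);
  }
}

The speedup comes from removing num kernel-launch and reduction round trips per backward pass, which dominates for small per-image work; the arithmetic performed is unchanged.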