author    Mausoom Sarkar <msarkar@adobe.com>  2015-10-13 18:35:32 +0530
committer Mausoom Sarkar <msarkar@adobe.com>  2015-10-19 13:51:40 +0530
commit    4c93b3dc555891ae0ad75092b6c0f77508740ecf (patch)
tree      c24e85107d4ff5b62fdef45a4471580a82a9669c /src/caffe
parent    0151742509c23b927b0159230ac70399ac4f8967 (diff)
Moved the loop inside PReLUParamBackward so the reduction over the batch is done inside the kernel.
PReLU backward now takes about the same time as forward. Code cleanup: removed unnecessary code, fixed indentation, merged the if (channel_shared_) branches.
Diffstat (limited to 'src/caffe')
-rw-r--r--  src/caffe/layers/prelu_layer.cu | 44
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu
index e1f20048..1225334f 100644
--- a/src/caffe/layers/prelu_layer.cu
+++ b/src/caffe/layers/prelu_layer.cu
@@ -31,10 +31,15 @@ __global__ void PReLUBackward(const int n, const int channels, const int dim,
// CUDA kernel for element-wise parameter backward
template <typename Dtype>
-__global__ void PReLUParamBackward(const int n, const Dtype* in_diff,
+__global__ void PReLUParamBackward(const int n,
+ const int rows, const int rowPitch, const Dtype* in_diff,
const Dtype* in_data, Dtype* out_diff) {
CUDA_KERNEL_LOOP(index, n) {
out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+ for ( int k = 1; k < rows; k++ ) {
+ out_diff[index] += in_diff[index + k*rowPitch]
+ * in_data[index + k*rowPitch] * (in_data[index + k*rowPitch] <= 0);
+ }
}
}
@@ -82,29 +87,24 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
if (this->param_propagate_down_[0]) {
Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
int cdim = channels * dim;
- Dtype dsum = 0.;
- for (int n = 0; n < bottom[0]->num(); ++n) {
- // compute element-wise diff
- // NOLINT_NEXT_LINE(whitespace/operators)
- PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
- CAFFE_CUDA_NUM_THREADS>>>(
- cdim, top_diff + top[0]->offset(n),
- bottom_data + bottom[0]->offset(n),
- backward_buff_.mutable_gpu_diff());
- CUDA_POST_KERNEL_CHECK;
- if (channel_shared_) {
- Dtype d;
- caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
- multiplier_.gpu_data(), &d);
- dsum += d;
- } else {
- caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
- backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
- slope_diff);
- }
- }
+
+ // compute element-wise diff
+ // NOLINT_NEXT_LINE(whitespace/operators)
+ PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
+ CAFFE_CUDA_NUM_THREADS>>>(
+ cdim, bottom[0]->num(), top[0]->offset(1), top_diff ,
+ bottom_data ,
+ backward_buff_.mutable_gpu_diff());
+ CUDA_POST_KERNEL_CHECK;
if (channel_shared_) {
+ Dtype dsum;
+ caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
+ multiplier_.gpu_data(), &dsum);
caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+ } else {
+ caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
+ backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+ slope_diff);
}
}
// Propagate to bottom
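
For reference, below is a minimal standalone sketch of the reduction-inside-kernel pattern this patch introduces. It is not Caffe code: the kernel name param_backward_sum, the parameter names, and the grid-stride loop form are illustrative assumptions; the real patch keeps Caffe's PReLUParamBackward signature and the CAFFE_KERNEL_LOOP / CAFFE_GET_BLOCKS launch helpers.

// Sketch (assumed names): each thread owns one of the cdim = channels * dim
// positions and walks the `rows` samples of the batch, accumulating the
// per-parameter gradient contribution in a register. One launch thus replaces
// the former host-side loop of one launch per sample.
template <typename Dtype>
__global__ void param_backward_sum(const int cdim, const int rows,
    const int row_pitch, const Dtype* in_diff, const Dtype* in_data,
    Dtype* out_diff) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < cdim;
       index += blockDim.x * gridDim.x) {
    Dtype sum = 0;
    for (int k = 0; k < rows; ++k) {  // reduce over the batch dimension
      const int off = index + k * row_pitch;
      sum += in_diff[off] * in_data[off] * (in_data[off] <= 0);
    }
    out_diff[index] = sum;
  }
}

Before the patch, the host looped over bottom[0]->num() samples, launching the kernel once per sample and, in the channel_shared_ case, calling caffe_gpu_dot after every launch (each dot product synchronizes to copy a scalar back to the host). With the loop folded into the kernel there is a single launch followed by a single dot or gemv, which is why backward now costs roughly the same as forward.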