author     Evan Shelhamer <shelhamer@imaginarynumber.net>  2015-01-25 23:05:33 -0800
committer  Evan Shelhamer <shelhamer@imaginarynumber.net>  2015-01-25 23:05:33 -0800
commit     eb107449c46e92be2045d4b84ac06a77bab0ce23 (patch)
tree       e109fc3b4221bd7d03885d8434797927fec2ee76
parent     47e8831ac09c82fb210e8d032a2f770499619540 (diff)
parent     a0ada4fc9e0075d8ba57f128a111fd5e214266c8 (diff)
Merge pull request #1755 from jeffdonahue/softmax-optimization
SoftmaxLayer GPU optimization
-rw-r--r--  src/caffe/layers/softmax_layer.cu | 48
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu
index 292ad2b3..6b8871a0 100644
--- a/src/caffe/layers/softmax_layer.cu
+++ b/src/caffe/layers/softmax_layer.cu
@@ -25,14 +25,13 @@ __global__ void kernel_channel_max(const int num, const int channels,
 }
 
 template <typename Dtype>
-__global__ void kernel_channel_subtract(const int num, const int channels,
-    const int spatial_dim, Dtype* data, const Dtype* channel_max) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
+__global__ void kernel_channel_subtract(const int count,
+    const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data) {
+  CUDA_KERNEL_LOOP(index, count) {
+    int n = index / channels / spatial_dim;
     int s = index % spatial_dim;
-    for (int c = 0; c < channels; ++c) {
-      data[(n * channels + c) * spatial_dim + s] -= channel_max[index];
-    }
+    data[index] -= channel_max[n * spatial_dim + s];
   }
 }
 
@@ -58,14 +57,13 @@ __global__ void kernel_channel_sum(const int num, const int channels,
 }
 
 template <typename Dtype>
-__global__ void kernel_channel_div(const int num, const int channels,
-    const int spatial_dim, Dtype* data, const Dtype* channel_sum) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
+__global__ void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_sum, Dtype* data) {
+  CUDA_KERNEL_LOOP(index, count) {
+    int n = index / channels / spatial_dim;
     int s = index % spatial_dim;
-    for (int c = 0; c < channels; ++c) {
-      data[(n * channels + c) * spatial_dim + s] /= channel_sum[index];
-    }
+    data[index] /= channel_sum[n * spatial_dim + s];
   }
 }
 
@@ -91,10 +89,11 @@ void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
   Dtype* scale_data = scale_.mutable_gpu_data();
+  int count = bottom[0]->count();
   int num = bottom[0]->num();
   int channels = bottom[0]->channels();
   int spatial_dim = bottom[0]->height() * bottom[0]->width();
-  caffe_copy(bottom[0]->count(), bottom_data, top_data);
+  caffe_copy(count, bottom_data, top_data);
   // We need to subtract the max to avoid numerical issues, compute the exp,
   // and then normalize.
   // compute max
@@ -104,9 +103,9 @@ void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       scale_data);
   // subtract
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(num * spatial_dim),
-      CAFFE_CUDA_NUM_THREADS>>>(num, channels, spatial_dim, top_data,
-      scale_data);
+  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
+      CAFFE_CUDA_NUM_THREADS>>>(count, num, channels, spatial_dim,
+      scale_data, top_data);
   // exponentiate
   // NOLINT_NEXT_LINE(whitespace/operators)
   kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(num * channels * spatial_dim),
@@ -119,9 +118,9 @@ void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       scale_data);
   // divide
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(num * spatial_dim),
-      CAFFE_CUDA_NUM_THREADS>>>(num, channels, spatial_dim, top_data,
-      scale_data);
+  kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
+      CAFFE_CUDA_NUM_THREADS>>>(count, num, channels, spatial_dim,
+      scale_data, top_data);
 }
 
 template <typename Dtype>
@@ -131,6 +130,7 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* top_data = top[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
   Dtype* scale_data = scale_.mutable_gpu_data();
+  int count = top[0]->count();
   int num = top[0]->num();
   int channels = top[0]->channels();
   int spatial_dim = top[0]->height() * top[0]->width();
@@ -141,9 +141,9 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       CAFFE_CUDA_NUM_THREADS>>>(num, channels, spatial_dim, top_diff,
       top_data, scale_data);
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(num * spatial_dim),
-      CAFFE_CUDA_NUM_THREADS>>>(num, channels, spatial_dim, bottom_diff,
-      scale_data);
+  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
+      CAFFE_CUDA_NUM_THREADS>>>(count, num, channels, spatial_dim,
+      scale_data, bottom_diff);
   // elementwise multiplication
   caffe_gpu_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
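
To make the optimization concrete, here is a minimal, self-contained CUDA sketch (not Caffe code) contrasting the two kernel shapes in this diff: the old kernel launches one thread per (n, s) position and walks the channels serially, while the optimized kernel launches one thread per element over count = num * channels * spatial_dim and indexes the per-position maximum directly. KERNEL_LOOP below is a simplified stand-in for Caffe's CUDA_KERNEL_LOOP grid-stride macro, and the 256-thread launch configuration is illustrative, not Caffe's CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS values.

#include <cstdio>
#include <cuda_runtime.h>

// Simplified stand-in for Caffe's CUDA_KERNEL_LOOP grid-stride loop.
#define KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Before: one thread per (n, s) position; each thread loops over channels.
__global__ void subtract_looped(int num, int channels, int spatial_dim,
                                const float* channel_max, float* data) {
  KERNEL_LOOP(index, num * spatial_dim) {
    int n = index / spatial_dim;
    int s = index % spatial_dim;
    for (int c = 0; c < channels; ++c) {
      data[(n * channels + c) * spatial_dim + s] -= channel_max[index];
    }
  }
}

// After: one thread per element; the channel loop disappears, and each
// thread recovers its (n, s) coordinates to look up the per-position max.
__global__ void subtract_flat(int count, int channels, int spatial_dim,
                              const float* channel_max, float* data) {
  KERNEL_LOOP(index, count) {
    int n = index / channels / spatial_dim;
    int s = index % spatial_dim;
    data[index] -= channel_max[n * spatial_dim + s];
  }
}

int main() {
  const int num = 2, channels = 3, spatial_dim = 4;
  const int count = num * channels * spatial_dim;
  float h_data[count], h_max[num * spatial_dim];
  for (int i = 0; i < count; ++i) h_data[i] = static_cast<float>(i);
  for (int i = 0; i < num * spatial_dim; ++i) h_max[i] = 1.0f;

  float *d_data, *d_max;
  cudaMalloc(&d_data, count * sizeof(float));
  cudaMalloc(&d_max, num * spatial_dim * sizeof(float));
  cudaMemcpy(d_data, h_data, count * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_max, h_max, num * spatial_dim * sizeof(float),
             cudaMemcpyHostToDevice);

  // Drive only the optimized kernel; subtract_looped is defined above
  // purely for comparison with the pre-optimization shape.
  subtract_flat<<<(count + 255) / 256, 256>>>(count, channels, spatial_dim,
                                              d_max, d_data);
  cudaMemcpy(h_data, d_data, count * sizeof(float), cudaMemcpyDeviceToHost);
  printf("data[0] after subtract: %f\n", h_data[0]);  // 0 - 1 -> -1.0

  cudaFree(d_data);
  cudaFree(d_max);
  return 0;
}

The design choice the patch makes is to trade a serial inner loop for channels times as many threads: each thread now performs one subtraction (or one division, in kernel_channel_div) plus an integer divide and modulus to locate its (n, s) position, which is why the new count argument is threaded through both the kernel signatures and the launch configurations.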