#include #include "caffe/common.hpp" #include "caffe/util/im2col.hpp" namespace caffe { template __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, Dtype* data_col) { CUDA_KERNEL_LOOP(index, n) { const int h_index = index / width_col; const int h_col = h_index % height_col; const int w_col = index % width_col; const int c_im = h_index / height_col; const int c_col = c_im * kernel_h * kernel_w; const int h_offset = h_col * stride_h - pad_h; const int w_offset = w_col * stride_w - pad_w; Dtype* data_col_ptr = data_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; const Dtype* data_im_ptr = data_im; data_im_ptr += (c_im * height + h_offset) * width + w_offset; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { int h_im = h_offset + i; int w_im = w_offset + j; *data_col_ptr = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? data_im_ptr[i * width + j] : 0; data_col_ptr += height_col * width_col; } } } } template void im2col_gpu(const Dtype* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int num_kernels = channels * height_col * width_col; // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<>>( num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, data_col); CUDA_POST_KERNEL_CHECK; } // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_col); template void im2col_gpu(const double* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col); template __global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im, const int* im_shape, const int* col_shape, const int* kernel_shape, const int* pad, const int* stride, Dtype* data_col) { int d_temp[num_axes]; // NOLINT(runtime/arrays) int d_iter[num_axes]; // NOLINT(runtime/arrays) int i; CUDA_KERNEL_LOOP(index, n) { // Initialize channel_in, computed in the loop below, with intermediate // computations used to compute the spatial indices. 
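    // Note on the two loops below (a reading aid, not original to the code):
    // after them, channel_out holds the flattened col-buffer offset of the
    // first (top-left) kernel element for this output location, channel_in
    // holds the flattened image offset of the corresponding, possibly
    // negative (padded) input start position, d_temp holds the per-axis
    // input start coordinates, and data_col_inc is the spatial size of one
    // col-buffer channel, i.e. the stride between successive kernel elements.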
    int channel_in = index;
    int channel_out = 1;
    for (i = num_axes - 1; i >= 0; --i) {
      d_temp[i] = channel_in % col_shape[i + 1];
      channel_in /= col_shape[i + 1];
      channel_out *= kernel_shape[i];
    }
    channel_out *= channel_in;
    int data_col_inc = 1;
    for (i = 0; i < num_axes; ++i) {
      channel_out *= col_shape[i + 1];
      channel_out += d_temp[i];
      d_temp[i] = d_temp[i] * stride[i] - pad[i];
      channel_in *= im_shape[i + 1];
      channel_in += d_temp[i];
      data_col_inc *= col_shape[i + 1];
      d_iter[i] = 0;
    }
    Dtype* data_col_ptr = data_col + channel_out;
    const Dtype* data_im_ptr = data_im + channel_in;
    bool incremented;
    do {
      bool in_range = true;
      for (i = 0; i < num_axes; ++i) {
        const int d_iter_im = d_iter[i] + d_temp[i];
        in_range &= d_iter_im >= 0 && d_iter_im < im_shape[i + 1];
        if (!in_range) { break; }
      }
      if (in_range) {
        int data_im_offset = d_iter[0];
        for (i = 1; i < num_axes; ++i) {
          data_im_offset *= im_shape[i + 1];
          data_im_offset += d_iter[i];
        }
        *data_col_ptr = data_im_ptr[data_im_offset];
      } else {
        *data_col_ptr = 0;
      }
      data_col_ptr += data_col_inc;
      incremented = false;
      for (i = num_axes - 1; i >= 0; --i) {
        const int d_max = kernel_shape[i];
        if (d_iter[i] == d_max - 1) {
          d_iter[i] = 0;
        } else {  // d_iter[i] < d_max - 1
          ++d_iter[i];
          incremented = true;
          break;
        }
      }  // for (int i = num_axes - 1; i >= 0; --i)
    } while (incremented);  // do
  }  // CUDA_KERNEL_LOOP(index, n)
}

template <typename Dtype>
void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
    const int num_kernels, const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    Dtype* data_col) {
  switch (num_spatial_axes) {
  case 1:
    im2col_nd_gpu_kernel<Dtype, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 2:
    im2col_nd_gpu_kernel<Dtype, 2>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 3:
    im2col_nd_gpu_kernel<Dtype, 3>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 4:
    im2col_nd_gpu_kernel<Dtype, 4>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 5:
    im2col_nd_gpu_kernel<Dtype, 5>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 6:
    im2col_nd_gpu_kernel<Dtype, 6>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 7:
    im2col_nd_gpu_kernel<Dtype, 7>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 8:
    im2col_nd_gpu_kernel<Dtype, 8>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 9:
    im2col_nd_gpu_kernel<Dtype, 9>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  case 10:
    im2col_nd_gpu_kernel<Dtype, 10>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
        num_kernels, data_im, im_shape, col_shape,
        kernel_shape, pad, stride, data_col);
    break;
  default:
    LOG(FATAL) << "im2col_nd_gpu does not support computation with "
               << num_spatial_axes << " spatial axes";
  }
  CUDA_POST_KERNEL_CHECK;
}

// Explicit instantiation
template void im2col_nd_gpu<float>(const float* data_im,
    const int num_spatial_axes, const int col_size, const int* im_shape,
    const int* col_shape, const int* kernel_shape, const int* pad,
    const int* stride, float* data_col);
template void im2col_nd_gpu<double>(const double* data_im,
    const int num_spatial_axes, const int col_size, const int* im_shape,
    const int* col_shape, const int* kernel_shape, const int* pad,
    const int* stride, double* data_col);

template <typename Dtype>
__global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    Dtype* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    Dtype val = 0;
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    // compute the start and end of the output
    const int w_col_start =
        (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
        (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    /*
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        // the col location: [c * width * height + h_out, w_out]
        int c_col = c_im * kernel_h * kernel_w
            + (h_im - h_col * stride_h) * kernel_w + (w_im - w_col * stride_w);
        val += data_col[(c_col * height_col + h_col) * width_col + w_col];
      }
    }
    */
    // equivalent implementation
    int offset = (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im)
        * height_col * width_col;
    int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    int coeff_w_col = (1 - stride_w * height_col * width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[index] = val;
  }
}

template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, Dtype* data_im) {
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
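  // Worked example (illustrative numbers): with a 5 x 5 input, a 3 x 3
  // kernel, stride 1 and no padding, height_col = width_col = (5 - 3) / 1 + 1
  // = 3, and each of the channels * 5 * 5 threads launched below sums the at
  // most 3 x 3 = 9 column entries that read its input pixel during im2col.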
  // NOLINT_NEXT_LINE(whitespace/operators)
  col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                             CAFFE_CUDA_NUM_THREADS>>>(
      num_kernels, data_col, height, width, channels, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w,
      height_col, width_col, data_im);
  CUDA_POST_KERNEL_CHECK;
}

// Explicit instantiation
template void col2im_gpu<float>(const float* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, float* data_im);
template void col2im_gpu<double>(const double* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, double* data_im);

template <typename Dtype, int num_axes>
__global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col,
    const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    Dtype* data_im) {
  int d_im[num_axes];  // NOLINT(runtime/arrays)
  int d_col_iter[num_axes];  // NOLINT(runtime/arrays)
  int d_col_start[num_axes];  // NOLINT(runtime/arrays)
  int d_col_end[num_axes];  // NOLINT(runtime/arrays)
  CUDA_KERNEL_LOOP(index, n) {
    // Initialize c_im, reduced in the loop below while extracting the
    // padded spatial coordinates d_im.
    int c_im = index;
    // Calculate d_im (padded image coordinates).
    for (int i = num_axes - 1; i >= 0; --i) {
      d_im[i] = c_im % im_shape[i + 1] + pad[i];
      c_im /= im_shape[i + 1];
    }
    // Calculate col start/end indices.
    bool done = false;
    for (int i = 0; i < num_axes; ++i) {
      d_col_start[i] = d_col_iter[i] =
          (d_im[i] < kernel_shape[i]) ?
          0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
      d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);
      if (d_col_start[i] >= d_col_end[i]) {
        // Skip computation if the dimension is 0 at any spatial axis --
        // final val will be 0.
        data_im[index] = 0;
        done = true;
        break;  // for (int i = 0; i < num_axes; ++i)
      }
    }
    if (done) {
      continue;  // CUDA_KERNEL_LOOP(index, n)
    }
    // Loop over the col to compute the output val.
    Dtype val = 0;
    bool incremented = true;
    do {
      // Compute the final offset.
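      // Reading aid (not original to the code): the col buffer is indexed by
      // (c_im * prod(kernel_shape) + kernel_offset) over the output spatial
      // shape, where the per-axis kernel offset is
      // d_im[i] - d_col_iter[i] * stride[i]; the two loops below flatten that
      // index into final_offset.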
      int final_offset = 0;
      int kernel_shape_prod = 1;
      for (int i = num_axes - 1; i >= 0; --i) {
        final_offset +=
            (d_im[i] - d_col_iter[i] * stride[i]) * kernel_shape_prod;
        kernel_shape_prod *= kernel_shape[i];
      }
      final_offset += kernel_shape_prod * c_im;
      for (int i = 0; i < num_axes; ++i) {
        final_offset *= col_shape[i + 1];
        final_offset += d_col_iter[i];
      }
      val += data_col[final_offset];
      incremented = false;
      for (int i = num_axes - 1; i >= 0; --i) {
        const int d_max = d_col_end[i];
        if (d_col_iter[i] == d_max - 1) {
          d_col_iter[i] = d_col_start[i];
        } else {  // d_col_iter[i] < d_max - 1
          ++d_col_iter[i];
          incremented = true;
          break;  // for (int i = num_axes - 1; i >= 0; --i)
        }
      }  // for (int i = num_axes - 1; i >= 0; --i)
    } while (incremented);
    data_im[index] = val;
  }  // CUDA_KERNEL_LOOP(index, n)
}

template <typename Dtype>
void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
    const int im_size, const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    Dtype* data_im) {
  switch (num_spatial_axes) {
  case 1:
    col2im_nd_gpu_kernel<Dtype, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 2:
    col2im_nd_gpu_kernel<Dtype, 2>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 3:
    col2im_nd_gpu_kernel<Dtype, 3>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 4:
    col2im_nd_gpu_kernel<Dtype, 4>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 5:
    col2im_nd_gpu_kernel<Dtype, 5>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 6:
    col2im_nd_gpu_kernel<Dtype, 6>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 7:
    col2im_nd_gpu_kernel<Dtype, 7>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 8:
    col2im_nd_gpu_kernel<Dtype, 8>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 9:
    col2im_nd_gpu_kernel<Dtype, 9>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  case 10:
    col2im_nd_gpu_kernel<Dtype, 10>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
        im_size, data_col, im_shape, col_shape,
        kernel_shape, pad, stride, data_im);
    break;
  default:
    LOG(FATAL) << "col2im_nd_gpu does not support computation with "
               << num_spatial_axes << " spatial axes";
  }
  CUDA_POST_KERNEL_CHECK;
}

// Explicit instantiation
template void col2im_nd_gpu<float>(const float* data_col,
    const int num_spatial_axes, const int im_size, const int* im_shape,
    const int* col_shape, const int* kernel_shape, const int* pad,
    const int* stride, float* data_im);
template void col2im_nd_gpu<double>(const double* data_col,
    const int num_spatial_axes, const int im_size, const int* im_shape,
    const int* col_shape, const int* kernel_shape, const int* pad,
    const int* stride, double* data_im);

}  // namespace caffe
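
// Usage sketch (illustrative only; the device buffer names below are
// hypothetical). For a channels x height x width input, data_col must hold
// channels * kernel_h * kernel_w * height_col * width_col values, with
// height_col and width_col computed as in im2col_gpu above.
//
//   // float* d_im:  device buffer, channels * height * width
//   // float* d_col: device buffer, channels * 3 * 3 * height_col * width_col
//   caffe::im2col_gpu<float>(d_im, channels, height, width,
//                            /*kernel_h=*/3, /*kernel_w=*/3,
//                            /*pad_h=*/1, /*pad_w=*/1,
//                            /*stride_h=*/1, /*stride_w=*/1, d_col);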