darknet  v3
im2col_kernels.cu
Go to the documentation of this file.
1 #include "cuda_runtime.h"
2 #include "curand.h"
3 #include "cublas_v2.h"
4 
5 extern "C" {
6 #include "im2col.h"
7 #include "cuda.h"
8 }
9 
10 // src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
11 // You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE
12 
/*
 * im2col_gpu_kernel: unrolls ksize x ksize image patches into a column buffer.
 *
 * Work decomposition: the n work items are distributed over the launched
 * threads with a grid-stride loop, so any 1-D launch configuration is valid.
 * Each work item corresponds to one (channel_in, h_out, w_out) triple and
 * writes ksize*ksize values to data_col, one per kernel tap (i, j).
 *
 * Parameters:
 *   n           number of work items (the caller passes
 *               channels * height_col * width_col)
 *   data_im     input image, indexed as (channel * height + h) * width + w
 *   height,
 *   width       spatial size of one input channel
 *   ksize       side length of the square patch
 *   pad         number of implicit zero rows/columns around the image
 *   stride      step between neighbouring patches
 *   height_col,
 *   width_col   spatial size of the output grid
 *   data_col    output, indexed as
 *               ((channel * ksize * ksize + i * ksize + j) * height_col
 *                + h_out) * width_col + w_out
 *
 * Taps that fall outside the image (because of padding) are written as 0.
 *
 * Offsets into data_im / data_col and the loop counter are kept in 64-bit
 * integers: the column buffer is ksize*ksize times larger than n, so 32-bit
 * products can wrap for large tensors even when n itself fits in an int.
 */
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_col) {
    // Total number of launched threads; 64-bit so the increment cannot wrap.
    const long long total_threads = (long long)blockDim.x * gridDim.x;
    // Distance in data_col between two consecutive kernel taps.
    const long long tap_stride = (long long)height_col * width_col;

    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < n; idx += total_threads) {
        // idx < n, so it fits in an int; keep div/mod in 32 bits (cheaper).
        const int index = (int)idx;
        const int w_out = index % width_col;
        const int h_index = index / width_col;
        const int h_out = h_index % height_col;
        const int channel_in = h_index / height_col;
        const int channel_out = channel_in * ksize * ksize;

        // Top-left corner of the patch in image coordinates; negative when
        // the patch starts inside the padding.
        const int h_in = h_out * stride - pad;
        const int w_in = w_out * stride - pad;

        long long col_offset =
            ((long long)channel_out * height_col + h_out) * width_col + w_out;
        // May be negative for padded patches. It is only ever combined with
        // (i, j) and dereferenced after the bounds test below, so no
        // out-of-range pointer is formed.
        const long long im_base =
            ((long long)channel_in * height + h_in) * width + w_in;

        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                const int h = h_in + i;
                const int w = w_in + j;
                const bool inside = (h >= 0 && w >= 0 && h < height && w < width);

                data_col[col_offset] =
                    inside ? data_im[im_base + (long long)i * width + j] : 0.0f;

                col_offset += tap_stride;
            }
        }
    }
}
47 
/*
 * im2col_gpu: host-side launcher for im2col_gpu_kernel.
 *
 * Computes the output grid size from the image size, patch size, padding and
 * stride, then launches one work item per (channel, output row, output
 * column); each work item copies a single ksize x ksize patch of one channel
 * into data_col.
 *
 * Parameters:
 *   im         input image passed straight to the kernel
 *              (presumably a device pointer -- confirm against callers)
 *   channels   number of image channels
 *   height,
 *   width      spatial size of one channel
 *   ksize      side length of the square patch
 *   stride     step between neighbouring patches
 *   pad        zero padding applied on every side
 *   data_col   output buffer passed straight to the kernel
 *
 * The launch is asynchronous and its status is not checked here.
 * If the configuration produces no output elements (for example the padded
 * image is smaller than the patch), the function returns without launching.
 */
void im2col_gpu(float *im,
         int channels, int height, int width,
         int ksize, int stride, int pad, float *data_col){
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;

    // Nothing to produce. Without this guard two negative extents would
    // multiply into a positive work-item count, and a zero count would ask
    // for a 0-block grid, which is an invalid launch configuration.
    if (channels <= 0 || height_col <= 0 || width_col <= 0) return;

    // One work item per (channel, h_out, w_out); each copies one
    // single-channel patch.
    int num_kernels = channels * height_col * width_col;

    // Ceil-divide so every work item is covered by a thread; BLOCK (defined
    // outside this file section) is the number of threads per block.
    im2col_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK>>>(
                num_kernels, im, height, width, ksize, pad,
                stride, height_col,
                width_col, data_col);
}
__global__ void im2col_gpu_kernel(const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col)
void im2col_gpu(float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col)