darknet v3
convolutional_kernels.cu
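/*
 * GPU (CUDA) implementation of darknet's convolutional layer: weight and
 * input binarization kernels (used by the binary/XNOR paths), the forward
 * and backward passes (cuDNN when available, otherwise im2col + GEMM),
 * device<->host parameter transfer, and the parameter update step.
 */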
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "gemm.h"
#include "blas.h"
#include "im2col.h"
#include "col2im.h"
#include "utils.h"
#include "cuda.h"
}

__global__ void binarize_kernel(float *x, int n, float *binary)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (i >= n) return;
    binary[i] = (x[i] >= 0) ? 1 : -1;
}

void binarize_gpu(float *x, int n, float *binary)
{
    binarize_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, binary);
    check_error(cudaPeekAtLastError());
}

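/*
 * The two kernels below binarize with a per-group scale rather than to +/-1:
 * each value becomes +mean or -mean of the absolute values in its group
 * (one column of the n x size input for binarize_input_kernel, one filter of
 * `size` weights for binarize_weights_kernel), i.e. sign(x) times the group's
 * mean magnitude, in the style of XNOR-Net binarization.
 */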
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
{
    int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (s >= size) return;
    int i = 0;
    float mean = 0;
    for(i = 0; i < n; ++i){
        mean += fabsf(input[i*size + s]);
    }
    mean = mean / n;
    for(i = 0; i < n; ++i){
        binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
    }
}

void binarize_input_gpu(float *input, int n, int size, float *binary)
{
    binarize_input_kernel<<<cuda_gridsize(size), BLOCK>>>(input, n, size, binary);
    check_error(cudaPeekAtLastError());
}

__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
{
    int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (f >= n) return;
    int i = 0;
    float mean = 0;
    for(i = 0; i < size; ++i){
        mean += fabsf(weights[f*size + i]);
    }
    mean = mean / size;
    for(i = 0; i < size; ++i){
        binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
        //binary[f*size + i] = weights[f*size + i];
    }
}

void binarize_weights_gpu(float *weights, int n, int size, float *binary)
{
    binarize_weights_kernel<<<cuda_gridsize(n), BLOCK>>>(weights, n, size, binary);
    check_error(cudaPeekAtLastError());
}

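/*
 * Forward pass. Without cuDNN, each (image, group) pair is computed as a
 * matrix multiply: im2col_gpu unrolls input patches into a k x n matrix
 * (k = size*size*c/groups, n = out_h*out_w), the filters form an m x k
 * matrix (m = l.n/groups), and output = weights * im2col(input) is an
 * m x n GEMM accumulated into l.output_gpu. As a worked example, a 3x3
 * convolution over 64 input channels producing a 128-channel 13x13 output
 * with groups = 1 runs one 128x576 * 576x169 GEMM per image.
 */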
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
    if(l.binary){
        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
    }

    if(l.xnor){
        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
        binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
        net.input_gpu = l.binary_input_gpu;
    }

#ifdef CUDNN
    float one = 1;
    cudnnConvolutionForward(cudnn_handle(),
                &one,
                l.srcTensorDesc,
                net.input_gpu,
                l.weightDesc,
                l.weights_gpu,
                l.convDesc,
                l.fw_algo,
                net.workspace,
                l.workspace_size,
                &one,
                l.dstTensorDesc,
                l.output_gpu);

#else
    int i, j;
    int m = l.n/l.groups;
    int k = l.size*l.size*l.c/l.groups;
    int n = l.out_w*l.out_h;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights_gpu + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output_gpu + (i*l.groups + j)*n*m;
            float *im = net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w;

            if (l.size == 1){
                b = im;
            } else {
                im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            }
            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
#endif

    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, net);
    } else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }

    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
    //if(l.dot > 0) dot_error_gpu(l);
    if(l.binary || l.xnor) swap_binary(&l);
}

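/*
 * smooth_kernel adds a local smoothness term to the layer's delta: for every
 * output element it accumulates rate * (neighbor - value) over a size x size
 * window, skipping out-of-bounds neighbors, which nudges each activation
 * toward its local neighborhood. smooth_layer launches it over the whole
 * output and is called from the backward pass when l.smooth is set.
 */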
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    int w_offset = -(size/2.f);
    int h_offset = -(size/2.f);

    int out_index = j + w*(i + h*(k + c*b));
    int l, m;
    for(l = 0; l < size; ++l){
        for(m = 0; m < size; ++m){
            int cur_h = h_offset + i + l;
            int cur_w = w_offset + j + m;
            int index = cur_w + w*(cur_h + h*(k + b*c));
            int valid = (cur_h >= 0 && cur_h < h &&
                    cur_w >= 0 && cur_w < w);
            delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
        }
    }
}

extern "C" void smooth_layer(layer l, int size, float rate)
{
    int h = l.out_h;
    int w = l.out_w;
    int c = l.out_c;

    size_t n = h*w*c*l.batch;

    smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu);
    check_error(cudaPeekAtLastError());
}

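/*
 * Backward pass. Without cuDNN, two GEMMs run per (image, group), with
 * m = l.n/groups, n = size*size*c/groups, k = out_h*out_w:
 *   weight_updates += delta * im2col(input)^T        (m x k times k x n)
 *   im2col-space input delta = weights^T * delta     (n x m times m x k),
 * the latter scattered back into net.delta_gpu by col2im_gpu. The cuDNN path
 * does the same two steps via cudnnConvolutionBackwardFilter / BackwardData.
 */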
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    if(l.smooth){
        smooth_layer(l, 5, l.smooth);
    }
    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, net);
    } else {
        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    float *original_input = net.input_gpu;

    if(l.xnor) net.input_gpu = l.binary_input_gpu;
#ifdef CUDNN
    float one = 1;
    cudnnConvolutionBackwardFilter(cudnn_handle(),
            &one,
            l.srcTensorDesc,
            net.input_gpu,
            l.ddstTensorDesc,
            l.delta_gpu,
            l.convDesc,
            l.bf_algo,
            net.workspace,
            l.workspace_size,
            &one,
            l.dweightDesc,
            l.weight_updates_gpu);

    if(net.delta_gpu){
        if(l.binary || l.xnor) swap_binary(&l);
        cudnnConvolutionBackwardData(cudnn_handle(),
                &one,
                l.weightDesc,
                l.weights_gpu,
                l.ddstTensorDesc,
                l.delta_gpu,
                l.convDesc,
                l.bd_algo,
                net.workspace,
                l.workspace_size,
                &one,
                l.dsrcTensorDesc,
                net.delta_gpu);
        if(l.binary || l.xnor) swap_binary(&l);
        if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
    }

#else
    int m = l.n/l.groups;
    int n = l.size*l.size*l.c/l.groups;
    int k = l.out_w*l.out_h;

    int i, j;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
            float *b = net.workspace;
            float *c = l.weight_updates_gpu + j*l.nweights/l.groups;

            float *im  = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
            float *imd = net.delta_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;

            im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

            if (net.delta_gpu) {
                if (l.binary || l.xnor) swap_binary(&l);
                a = l.weights_gpu + j*l.nweights/l.groups;
                b = l.delta_gpu + (i*l.groups + j)*m*k;
                c = net.workspace;
                if (l.size == 1) {
                    c = imd;
                }

                gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);

                if (l.size != 1) {
                    col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
                }
                if(l.binary || l.xnor) {
                    swap_binary(&l);
                }
            }
            if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
        }
    }
#endif
}

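/*
 * pull/push copy the layer's learnable parameters (and, with batch norm, the
 * rolling statistics) between the GPU buffers and their host-side mirrors;
 * darknet typically pulls before saving weights and pushes after loading them.
 */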
void pull_convolutional_layer(layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
    cuda_pull_array(l.biases_gpu, l.biases, l.n);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        cuda_pull_array(l.scales_gpu, l.scales, l.n);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}

void push_convolutional_layer(layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
    cuda_push_array(l.biases_gpu, l.biases, l.n);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
    if (l.batch_normalize){
        cuda_push_array(l.scales_gpu, l.scales, l.n);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
    }
}

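/*
 * Parameter update. With a.adam set, Adam is applied to each tensor;
 * otherwise the axpy/scal calls implement SGD with momentum and L2 weight
 * decay:
 *   weight_updates -= decay*batch * weights
 *   weights        += (learning_rate/batch) * weight_updates
 *   weight_updates *= momentum
 * Biases and batch-norm scales get the same update without the decay term,
 * and a nonzero l.clip clamps the weights to [-clip, clip] at the end.
 */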
void update_convolutional_layer_gpu(layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        }
    }else{
        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
        }
    }
    if(l.clip){
        constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
    }
}