12 #include "xnor_layer.h" 22 swap = l->weights_gpu;
23 l->weights_gpu = l->binary_weights_gpu;
24 l->binary_weights_gpu = swap;
31 for(f = 0; f < n; ++f){
33 for(i = 0; i < size; ++i){
34 mean += fabs(weights[f*size + i]);
37 for(i = 0; i < size; ++i){
38 binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
46 for(i = 0; i < n; ++i){
47 binary[i] = (input[i] > 0) ? 1 : -1;
54 for(s = 0; s < size; ++s){
56 for(i = 0; i < n; ++i){
57 mean += fabs(input[i*size + s]);
60 for(i = 0; i < n; ++i){
61 binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
86 static size_t get_workspace_size(
layer l){
91 cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
98 if (s > most) most = s;
99 cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
106 if (s > most) most = s;
107 cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
114 if (s > most) most = s;
123 void cudnn_convolutional_setup(
layer *l)
125 cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->
batch, l->
c, l->
h, l->
w);
126 cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->
batch, l->
out_c, l->
out_h, l->
out_w);
128 cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->
batch, l->
c, l->
h, l->
w);
129 cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->
batch, l->
out_c, l->
out_h, l->
out_w);
130 cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->
out_c, 1, 1);
132 cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->
n, l->
c/l->
groups, l->
size, l->
size);
133 cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->
n, l->
c/l->
groups, l->
size, l->
size);
135 cudnnSetConvolution2dDescriptor(l->convDesc, l->
pad, l->
pad, l->
stride, l->
stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
137 cudnnSetConvolution2dDescriptor(l->convDesc, l->
pad, l->
pad, l->
stride, l->
stride, 1, 1, CUDNN_CROSS_CORRELATION);
141 cudnnSetConvolutionGroupCount(l->convDesc, l->
groups);
144 error(
"CUDNN < 7 doesn't support groups, please upgrade!");
148 cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
153 CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
156 cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
161 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
164 cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
169 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
176 convolutional_layer make_convolutional_layer(
int batch,
int h,
int w,
int c,
int n,
int groups,
int size,
int stride,
int padding,
ACTIVATION activation,
int batch_normalize,
int binary,
int xnor,
int adam)
195 l.
weights = calloc(c/groups*n*size*size,
sizeof(
float));
198 l.
biases = calloc(n,
sizeof(
float));
205 float scale = sqrt(2./(size*size*c/l.
groups));
227 l.
scales = calloc(n,
sizeof(
float));
235 l.
scales = calloc(n,
sizeof(
float));
237 for(i = 0; i < n; ++i){
241 l.
mean = calloc(n,
sizeof(
float));
242 l.
variance = calloc(n,
sizeof(
float));
255 l.
bias_m = calloc(n,
sizeof(
float));
256 l.
scale_m = calloc(n,
sizeof(
float));
257 l.
bias_v = calloc(n,
sizeof(
float));
258 l.
scale_v = calloc(n,
sizeof(
float));
268 l.m_gpu = cuda_make_array(l.
m, l.
nweights);
269 l.v_gpu = cuda_make_array(l.
v, l.
nweights);
270 l.bias_m_gpu = cuda_make_array(l.
bias_m, n);
271 l.bias_v_gpu = cuda_make_array(l.
bias_v, n);
272 l.scale_m_gpu = cuda_make_array(l.
scale_m, n);
273 l.scale_v_gpu = cuda_make_array(l.
scale_v, n);
279 l.biases_gpu = cuda_make_array(l.
biases, n);
280 l.bias_updates_gpu = cuda_make_array(l.
bias_updates, n);
282 l.delta_gpu = cuda_make_array(l.
delta, l.
batch*out_h*out_w*n);
283 l.output_gpu = cuda_make_array(l.
output, l.
batch*out_h*out_w*n);
290 l.binary_input_gpu = cuda_make_array(0, l.
inputs*l.
batch);
294 l.mean_gpu = cuda_make_array(l.
mean, n);
295 l.variance_gpu = cuda_make_array(l.
variance, n);
297 l.rolling_mean_gpu = cuda_make_array(l.
mean, n);
298 l.rolling_variance_gpu = cuda_make_array(l.
variance, n);
300 l.mean_delta_gpu = cuda_make_array(l.
mean, n);
301 l.variance_delta_gpu = cuda_make_array(l.
variance, n);
303 l.scales_gpu = cuda_make_array(l.
scales, n);
306 l.x_gpu = cuda_make_array(l.
output, l.
batch*out_h*out_w*n);
307 l.x_norm_gpu = cuda_make_array(l.
output, l.
batch*out_h*out_w*n);
310 cudnnCreateTensorDescriptor(&l.normTensorDesc);
311 cudnnCreateTensorDescriptor(&l.srcTensorDesc);
312 cudnnCreateTensorDescriptor(&l.dstTensorDesc);
313 cudnnCreateFilterDescriptor(&l.weightDesc);
314 cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
315 cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
316 cudnnCreateFilterDescriptor(&l.dweightDesc);
317 cudnnCreateConvolutionDescriptor(&l.convDesc);
318 cudnn_convolutional_setup(&l);
325 fprintf(stderr,
"conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.
out_w, l.
out_h, l.
out_c, (2.0 * l.
n * l.
size*l.
size*l.
c/l.
groups * l.
out_h*l.
out_w)/1000000000.);
333 for(i = 0; i < l.
n; ++i){
391 cuda_free(l->delta_gpu);
392 cuda_free(l->output_gpu);
399 cuda_free(l->x_norm_gpu);
405 cudnn_convolutional_setup(l);
411 void add_bias(
float *output,
float *biases,
int batch,
int n,
int size)
414 for(b = 0; b < batch; ++b){
415 for(i = 0; i < n; ++i){
416 for(j = 0; j < size; ++j){
417 output[(b*n + i)*size + j] += biases[i];
423 void scale_bias(
float *output,
float *scales,
int batch,
int n,
int size)
426 for(b = 0; b < batch; ++b){
427 for(i = 0; i < n; ++i){
428 for(j = 0; j < size; ++j){
429 output[(b*n + i)*size + j] *= scales[i];
435 void backward_bias(
float *bias_updates,
float *delta,
int batch,
int n,
int size)
438 for(b = 0; b < batch; ++b){
439 for(i = 0; i < n; ++i){
440 bias_updates[i] +=
sum_array(delta+size*(i+b*n), size);
461 for(i = 0; i < l.
batch; ++i){
462 for(j = 0; j < l.
groups; ++j){
473 gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
502 for(i = 0; i < l.
batch; ++i){
503 for(j = 0; j < l.
groups; ++j){
518 gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
528 gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
542 float decay = a.
decay;
570 for(i = 0; i < l.
n; ++i){
581 for(i = 0; i < l.
n; ++i){
595 for(i = 0; i < l.
n; ++i){
616 sprintf(buff,
"%s: Output", window);
620 return single_weights;
void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
image * visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
image copy_image(image p)
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
void(* update)(struct layer, update_args)
void(* forward_gpu)(struct layer, struct network)
void add_bias(float *output, float *biases, int batch, int n, int size)
void(* backward_gpu)(struct layer, struct network)
void denormalize_convolutional_layer(convolutional_layer l)
void(* update_gpu)(struct layer, update_args)
int convolutional_out_width(convolutional_layer l)
void(* forward)(struct layer, struct network)
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta)
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
image float_to_image(int w, int h, int c, float *data)
void show_images(image *ims, int n, char *window)
image get_convolutional_image(convolutional_layer l)
void fill_cpu(int N, float ALPHA, float *X, int INCX)
int convolutional_out_height(convolutional_layer l)
void rgbgr_weights(convolutional_layer l)
void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)
void normalize_image(image p)
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
void(* backward)(struct layer, struct network)
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
image get_convolutional_delta(convolutional_layer l)
void forward_convolutional_layer(convolutional_layer l, network net)
image get_convolutional_weight(convolutional_layer l, int i)
void scal_cpu(int N, float ALPHA, float *X, int INCX)
float sum_array(float *a, int n)
void backward_convolutional_layer(convolutional_layer l, network net)
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
void binarize_weights(float *weights, int n, int size, float *binary)
void binarize_input(float *input, int n, int size, float *binary)
void im2col_cpu(float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col)
image * get_weights(convolutional_layer l)
float learning_rate_scale
void binarize_cpu(float *input, int n, float *binary)
void activate_array(float *x, const int n, const ACTIVATION a)
void update_convolutional_layer(convolutional_layer l, update_args a)
void update_convolutional_layer_gpu(layer l, update_args a)
void rescale_weights(convolutional_layer l, float scale, float trans)
void forward_batchnorm_layer(layer l, network net)
image collapse_image_layers(image source, int border)
void scale_bias(float *output, float *scales, int batch, int n, int size)
void col2im_cpu(float *data_col, int channels, int height, int width, int ksize, int stride, int pad, float *data_im)
void backward_batchnorm_layer(layer l, network net)
void swap_binary(convolutional_layer *l)
void error(const char *s)
void scale_image(image m, float s)