1 #include "cuda_runtime.h" 18 int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
20 binary[i] = (x[i] >= 0) ? 1 : -1;
void binarize_gpu(float *x, int n, float *binary)
{
    binarize_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, binary);
    check_error(cudaPeekAtLastError());
}
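/*
 * binarize_input_kernel below assigns one thread to each spatial position s.
 * The thread averages the absolute values of the n entries sharing that
 * position, then emits +mean or -mean according to each entry's sign, i.e.
 * sign(x) scaled by a per-position mean magnitude.
 */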
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
{
    int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (s >= size) return;
    int i;
    float mean = 0;
    for(i = 0; i < n; ++i){
        mean += fabsf(input[i*size + s]);
    }
    mean = mean / n;
    for(i = 0; i < n; ++i){
        binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
    }
}
void binarize_input_gpu(float *input, int n, int size, float *binary)
{
    binarize_input_kernel<<<cuda_gridsize(size), BLOCK>>>(input, n, size, binary);
    check_error(cudaPeekAtLastError());
}
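/*
 * binarize_weights_kernel below does the same per filter: thread f reduces
 * its filter's size weights to a mean absolute value and uses it as the
 * scaling factor for the binarized weights, as in XNOR-Net-style layers.
 */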
__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
{
    int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (f >= n) return;
    int i;
    float mean = 0;
    for(i = 0; i < size; ++i){
        mean += fabsf(weights[f*size + i]);
    }
    mean = mean / size;
    for(i = 0; i < size; ++i){
        binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
    }
}
void binarize_weights_gpu(float *weights, int n, int size, float *binary)
{
    binarize_weights_kernel<<<cuda_gridsize(n), BLOCK>>>(weights, n, size, binary);
    check_error(cudaPeekAtLastError());
}
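/*
 * A minimal host-side reference for checking binarize_weights_gpu against;
 * this sketch is not part of the original file, and binarize_weights_cpu_ref
 * is a hypothetical name. It mirrors the kernel's per-filter mean-abs scaling.
 */
#include <math.h>
static void binarize_weights_cpu_ref(const float *weights, int n, int size, float *binary)
{
    for(int f = 0; f < n; ++f){
        float mean = 0;
        for(int i = 0; i < size; ++i) mean += fabsf(weights[f*size + i]);
        mean /= size;
        for(int i = 0; i < size; ++i)
            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
    }
}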
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    /* ... output clearing and weight binarization elided ... */
    /* In XNOR mode the convolution consumes the binarized input. */
    net.input_gpu = l.binary_input_gpu;
#ifdef CUDNN
    cudnnConvolutionForward(cudnn_handle(),
            /* tensor/filter/convolution descriptors, forward algorithm,
               workspace, and output buffer arguments omitted in this excerpt */);
#else
    /* GEMM path: im2col + matrix multiply per image and per group. */
    int m = l.n/l.groups;
    int k = l.size*l.size*l.c/l.groups;
    int n = l.out_w*l.out_h;
    int i, j;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights_gpu + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output_gpu + (i*l.groups + j)*n*m;
            /* ... im2col of the input into the workspace elided ... */
            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
#endif
    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, net);
    }
    /* ... bias add and activation elided ... */
}
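/*
 * smooth_kernel assigns one thread per output element. Each thread walks a
 * size x size window around its position and adds rate*(neighbor - center)
 * to the element's delta, nudging the gradient toward locally smooth
 * activations; the `valid` mask skips out-of-bounds neighbors.
 */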
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    /* Unflatten the thread id into column j, row i, channel k, batch b. */
    int j = id % w;
    id /= w;
    int i = id % h;
    id /= h;
    int k = id % c;
    id /= c;
    int b = id;

    int w_offset = -(size/2.f);
    int h_offset = -(size/2.f);

    int out_index = j + w*(i + h*(k + c*b));
    int l, m;
    for(l = 0; l < size; ++l){
        for(m = 0; m < size; ++m){
            int cur_h = h_offset + i + l;
            int cur_w = w_offset + j + m;
            int index = cur_w + w*(cur_h + h*(k + b*c));
            int valid = (cur_h >= 0 && cur_h < h &&
                    cur_w >= 0 && cur_w < w);
            delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
        }
    }
}
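/*
 * smooth_layer launches the kernel with one thread per output element;
 * cuda_gridsize(n) sizes the grid so that n threads are covered at BLOCK
 * threads per block.
 */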
void smooth_layer(layer l, int size, float rate)
{
    int h = l.out_h;
    int w = l.out_w;
    int c = l.out_c;

    size_t n = h*w*c*l.batch;

    smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu);
    check_error(cudaPeekAtLastError());
}
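/*
 * backward_convolutional_layer_gpu (excerpted below) propagates gradients
 * through batch norm when enabled, accumulates weight gradients, and writes
 * the input gradient for the previous layer. The raw input pointer is saved
 * up front because XNOR mode substitutes the binarized input.
 */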
void backward_convolutional_layer_gpu(convolutional_layer l, network net)
{
    /* ... activation gradient elided ... */
    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, net);
    }
    float *original_input = net.input_gpu;

    if(l.xnor) net.input_gpu = l.binary_input_gpu;
#ifdef CUDNN
    cudnnConvolutionBackwardFilter(cudnn_handle(),
            /* descriptor, algorithm, and workspace arguments
               omitted in this excerpt */
            l.weight_updates_gpu);

    cudnnConvolutionBackwardData(cudnn_handle(),
            /* arguments omitted in this excerpt */);
#else
    int m = l.n/l.groups;
    int n = l.size*l.size*l.c/l.groups;
    int k = l.out_w*l.out_h;

    int i, j;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
            /* ... elided: im2col of the input into workspace b;
               c points at the weight updates ... */
            /* Accumulate the weight gradient: dW += delta * im2col(input)^T. */
            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

            /* ... elided: a is repointed at the weights, c at the workspace ... */
            b = l.delta_gpu + (i*l.groups + j)*m*k;
            /* Propagate into the input gradient: columns = W^T * delta. */
            gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);
        }
    }
#endif
}
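/*
 * The pull/push helpers mirror layer parameters between the device buffers
 * (*_gpu) and their host copies; only the bias and scale transfers appear
 * in this excerpt.
 */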
void pull_convolutional_layer(layer l)
{
    /* ... weight and update transfers elided ... */
    cuda_pull_array(l.biases_gpu, l.biases, l.n);
    /* ... */
    cuda_pull_array(l.scales_gpu, l.scales, l.n);
}

void push_convolutional_layer(layer l)
{
    /* ... */
    cuda_push_array(l.biases_gpu, l.biases, l.n);
    /* ... */
    cuda_push_array(l.scales_gpu, l.scales, l.n);
}
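/*
 * update_convolutional_layer_gpu applies either Adam (adam_update_gpu) or
 * SGD with momentum and L2 weight decay (axpy_gpu/scal_gpu), depending on
 * the solver's update_args.
 */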
void update_convolutional_layer_gpu(layer l, update_args a)
{
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    if(a.adam){
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        if(l.scales_gpu){
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
        }
    }else{
        /* SGD with momentum; weight decay is folded into the gradient. */
        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);

        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);

        if(l.scales_gpu){
            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
        }
    }
}
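/*
 * Scalar paraphrase of the SGD branch above, for one weight w with
 * accumulated gradient g (a clarifying sketch, not original code):
 *
 *     g -= decay*batch*w;              // fold the L2 penalty into g
 *     w += (learning_rate/batch)*g;    // step on the mean gradient
 *     g *= momentum;                   // decayed g carries to next step
 *
 * The batch factors keep the effective step size and decay independent of
 * the minibatch size.
 */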