darknet  v3
gru_layer.c
#include "gru_layer.h"
#include "connected_layer.h"
#include "utils.h"
#include "cuda.h"
#include "blas.h"
#include "gemm.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

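/*
 * Advance a sub-layer's per-time-step pointers by `steps` time steps.  Each
 * connected sub-layer stores output/delta (and their GPU mirrors) for the
 * whole unrolled sequence, so stepping through time is just pointer
 * arithmetic over slices of size outputs*batch.
 */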
static void increment_layer(layer *l, int steps)
{
    int num = l->outputs*l->batch*steps;
    l->output += num;
    l->delta += num;
    l->x += num;
    l->x_norm += num;

#ifdef GPU
    l->output_gpu += num;
    l->delta_gpu += num;
    l->x_gpu += num;
    l->x_norm_gpu += num;
#endif
}

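/*
 * A GRU layer is built from six fully-connected sub-layers, one input-side
 * (u*) and one recurrent (w*) transform per gate:
 *   uz, wz -> update gate z
 *   ur, wr -> reset gate r
 *   uh, wh -> candidate state h~
 * The u* layers take `inputs` features, the w* layers take the previous
 * hidden state (`outputs` features).  Each is allocated for batch*steps rows
 * so its buffers cover the whole unrolled sequence; the GRU layer itself then
 * works on one time step (batch rows) at a time.
 */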
layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
{
    fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
    batch = batch / steps;
    layer l = {0};
    l.batch = batch;
    l.type = GRU;
    l.steps = steps;
    l.inputs = inputs;

    l.uz = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
    l.uz->batch = batch;

    l.wz = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
    l.wz->batch = batch;

    l.ur = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
    l.ur->batch = batch;

    l.wr = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
    l.wr->batch = batch;

    l.uh = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
    l.uh->batch = batch;

    l.wh = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
    l.wh->batch = batch;

    l.batch_normalize = batch_normalize;

    l.outputs = outputs;
    l.output = calloc(outputs*batch*steps, sizeof(float));
    l.delta = calloc(outputs*batch*steps, sizeof(float));
    l.state = calloc(outputs*batch, sizeof(float));
    l.prev_state = calloc(outputs*batch, sizeof(float));
    l.forgot_state = calloc(outputs*batch, sizeof(float));
    l.forgot_delta = calloc(outputs*batch, sizeof(float));

    l.r_cpu = calloc(outputs*batch, sizeof(float));
    l.z_cpu = calloc(outputs*batch, sizeof(float));
    l.h_cpu = calloc(outputs*batch, sizeof(float));

    l.forward = forward_gru_layer;
    l.backward = backward_gru_layer;
    l.update = update_gru_layer;

#ifdef GPU
    l.forward_gpu = forward_gru_layer_gpu;
    l.backward_gpu = backward_gru_layer_gpu;
    l.update_gpu = update_gru_layer_gpu;

    l.forgot_state_gpu = cuda_make_array(0, batch*outputs);
    l.forgot_delta_gpu = cuda_make_array(0, batch*outputs);
    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
    l.state_gpu = cuda_make_array(0, batch*outputs);
    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
    l.delta_gpu = cuda_make_array(0, batch*outputs*steps);
    l.r_gpu = cuda_make_array(0, batch*outputs);
    l.z_gpu = cuda_make_array(0, batch*outputs);
    l.h_gpu = cuda_make_array(0, batch*outputs);

#ifdef CUDNN
    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
#endif
#endif

    return l;
}

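/*
 * Usage sketch (illustration only, not taken from the darknet sources):
 * running the layer stand-alone on the CPU.  Assumes `x` holds `steps`
 * consecutive time steps, each inputs*batch floats, laid out the way the
 * loop in forward_gru_layer consumes them.
 *
 *     layer l = make_gru_layer(batch*steps, inputs, outputs, steps, 0, 0);
 *     network net = {0};
 *     net.input = x;        // steps * inputs * batch floats
 *     net.train = 0;
 *     forward_gru_layer(l, net);
 *     // l.output now holds steps * outputs * batch activations
 */
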
/* Apply the optimizer step to each of the six connected sub-layers. */
void update_gru_layer(layer l, update_args a)
{
    update_connected_layer(*(l.ur), a);
    update_connected_layer(*(l.uz), a);
    update_connected_layer(*(l.uh), a);
    update_connected_layer(*(l.wr), a);
    update_connected_layer(*(l.wz), a);
    update_connected_layer(*(l.wh), a);
}

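/*
 * One time step of the forward pass.  With x_t the current input and h_{t-1}
 * the previous hidden state (l.state), the loop below computes
 *
 *     z_t  = logistic(uz(x_t) + wz(h_{t-1}))          update gate
 *     r_t  = logistic(ur(x_t) + wr(h_{t-1}))          reset gate
 *     h~_t = f(uh(x_t) + wh(r_t * h_{t-1}))           candidate, f = tanh or logistic
 *     h_t  = z_t * h_{t-1} + (1 - z_t) * h~_t         new state (weighted_sum_cpu)
 *
 * where `*` is element-wise; z_t = 1 keeps the previous state unchanged.
 */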
void forward_gru_layer(layer l, network net)
{
    network s = net;
    s.train = net.train;
    int i;
    layer uz = *(l.uz);
    layer ur = *(l.ur);
    layer uh = *(l.uh);

    layer wz = *(l.wz);
    layer wr = *(l.wr);
    layer wh = *(l.wh);

    fill_cpu(l.outputs * l.batch * l.steps, 0, uz.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ur.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, uh.delta, 1);

    fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
    if(net.train) {
        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
        copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
    }

    for (i = 0; i < l.steps; ++i) {
        /* recurrent transforms of the previous hidden state */
        s.input = l.state;
        forward_connected_layer(wz, s);
        forward_connected_layer(wr, s);

        /* input-side transforms of the current time step */
        s.input = net.input;
        forward_connected_layer(uz, s);
        forward_connected_layer(ur, s);
        forward_connected_layer(uh, s);

        /* z = logistic(uz + wz), r = logistic(ur + wr) */
        copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
        axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);

        copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
        axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);

        activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
        activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);

        /* candidate state: h~ = f(uh + wh(r * h_prev)) */
        copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1);
        mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);

        s.input = l.forgot_state;
        forward_connected_layer(wh, s);

        copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
        axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);

        if(l.tanh){
            activate_array(l.h_cpu, l.outputs*l.batch, TANH);
        } else {
            activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
        }

        /* new state: h = z*h_prev + (1-z)*h~ */
        weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);

        net.input += l.inputs*l.batch;
        l.output += l.outputs*l.batch;
        increment_layer(&uz, 1);
        increment_layer(&ur, 1);
        increment_layer(&uh, 1);

        increment_layer(&wz, 1);
        increment_layer(&wr, 1);
        increment_layer(&wh, 1);
    }
}

/* The CPU backward pass is an empty stub in this version; training this
   layer is only supported through the GPU implementation below. */
void backward_gru_layer(layer l, network net)
{
}

#ifdef GPU

/* Empty stubs: the GRU layer has no weights of its own; the trainable
   parameters live in the connected sub-layers. */
void pull_gru_layer(layer l)
{
}

void push_gru_layer(layer l)
{
}

void update_gru_layer_gpu(layer l, update_args a)
{
    update_connected_layer_gpu(*(l.ur), a);
    update_connected_layer_gpu(*(l.uz), a);
    update_connected_layer_gpu(*(l.uh), a);
    update_connected_layer_gpu(*(l.wr), a);
    update_connected_layer_gpu(*(l.wz), a);
    update_connected_layer_gpu(*(l.wh), a);
}

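/*
 * GPU forward pass: the same computation as forward_gru_layer, operating on
 * the *_gpu mirrors of the buffers and on the connected sub-layers' GPU
 * paths.
 */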
void forward_gru_layer_gpu(layer l, network net)
{
    network s = {0};
    s.train = net.train;
    int i;
    layer uz = *(l.uz);
    layer ur = *(l.ur);
    layer uh = *(l.uh);

    layer wz = *(l.wz);
    layer wr = *(l.wr);
    layer wh = *(l.wh);

    fill_gpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);

    fill_gpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
    fill_gpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
    if(net.train) {
        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
    }

    for (i = 0; i < l.steps; ++i) {
        s.input_gpu = l.state_gpu;
        forward_connected_layer_gpu(wz, s);
        forward_connected_layer_gpu(wr, s);

        s.input_gpu = net.input_gpu;
        forward_connected_layer_gpu(uz, s);
        forward_connected_layer_gpu(ur, s);
        forward_connected_layer_gpu(uh, s);

        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
        axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);

        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
        axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);

        activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
        activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);

        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
        mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);

        s.input_gpu = l.forgot_state_gpu;
        forward_connected_layer_gpu(wh, s);

        copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
        axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);

        if(l.tanh){
            activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
        } else {
            activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
        }

        weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);

        net.input_gpu += l.inputs*l.batch;
        l.output_gpu += l.outputs*l.batch;
        increment_layer(&uz, 1);
        increment_layer(&ur, 1);
        increment_layer(&uh, 1);

        increment_layer(&wz, 1);
        increment_layer(&wr, 1);
        increment_layer(&wh, 1);
    }
}

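/*
 * GPU backward pass (truncated BPTT over l.steps).  The loop walks the time
 * steps in reverse, recomputes the gate activations for each step, and then:
 *   - weighted_delta_gpu splits d(h_t) into gradients w.r.t. h_{t-1}
 *     (prev_delta), the candidate h~ (uh.delta) and the gate z (uz.delta);
 *   - the candidate gradient is pushed back through wh via the reset-gated
 *     state r*h_{t-1}, and the resulting forgot_delta contributes both to
 *     h_{t-1} and to the reset gate r;
 *   - the gate gradients are passed through the logistic derivative and then
 *     through the recurrent (w*) and input-side (u*) connected layers,
 *     accumulating the input gradient in net.delta.
 */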
void backward_gru_layer_gpu(layer l, network net)
{
    network s = {0};
    s.train = net.train;
    int i;
    layer uz = *(l.uz);
    layer ur = *(l.ur);
    layer uh = *(l.uh);

    layer wz = *(l.wz);
    layer wr = *(l.wr);
    layer wh = *(l.wh);

    increment_layer(&uz, l.steps - 1);
    increment_layer(&ur, l.steps - 1);
    increment_layer(&uh, l.steps - 1);

    increment_layer(&wz, l.steps - 1);
    increment_layer(&wr, l.steps - 1);
    increment_layer(&wh, l.steps - 1);

    net.input_gpu += l.inputs*l.batch*(l.steps-1);
    if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1);
    l.output_gpu += l.outputs*l.batch*(l.steps-1);
    l.delta_gpu += l.outputs*l.batch*(l.steps-1);
    float *end_state = l.output_gpu;
    for (i = l.steps-1; i >= 0; --i) {
        if(i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
        else copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
        float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;

        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
        axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);

        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
        axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);

        activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
        activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);

        copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
        axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);

        if(l.tanh){
            activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
        } else {
            activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
        }

        weighted_delta_gpu(l.state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);

        if(l.tanh){
            gradient_array_gpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
        } else {
            gradient_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
        }

        copy_gpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);

        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
        mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
        fill_gpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);

        s.input_gpu = l.forgot_state_gpu;
        s.delta_gpu = l.forgot_delta_gpu;

        backward_connected_layer_gpu(wh, s);
        if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
        mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.state_gpu, ur.delta_gpu);

        gradient_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
        copy_gpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);

        gradient_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
        copy_gpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);

        s.input_gpu = l.state_gpu;
        s.delta_gpu = prev_delta_gpu;

        backward_connected_layer_gpu(wr, s);
        backward_connected_layer_gpu(wz, s);

        s.input_gpu = net.input_gpu;
        s.delta_gpu = net.delta_gpu;

        backward_connected_layer_gpu(uh, s);
        backward_connected_layer_gpu(ur, s);
        backward_connected_layer_gpu(uz, s);

        net.input_gpu -= l.inputs*l.batch;
        if(net.delta_gpu) net.delta_gpu -= l.inputs*l.batch;
        l.output_gpu -= l.outputs*l.batch;
        l.delta_gpu -= l.outputs*l.batch;
        increment_layer(&uz, -1);
        increment_layer(&ur, -1);
        increment_layer(&uh, -1);

        increment_layer(&wz, -1);
        increment_layer(&wr, -1);
        increment_layer(&wh, -1);
    }
    copy_gpu(l.outputs*l.batch, end_state, 1, l.state_gpu, 1);
}
#endif