darknet v3
lstm_layer.c
1 #include "lstm_layer.h"
2 #include "connected_layer.h"
3 #include "utils.h"
4 #include "cuda.h"
5 #include "blas.h"
6 #include "gemm.h"
7 
8 #include <math.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 
13 static void increment_layer(layer *l, int steps)
14 {
15  int num = l->outputs*l->batch*steps;
16  l->output += num;
17  l->delta += num;
18  l->x += num;
19  l->x_norm += num;
20 
21 #ifdef GPU
22  l->output_gpu += num;
23  l->delta_gpu += num;
24  l->x_gpu += num;
25  l->x_norm_gpu += num;
26 #endif
27 }
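
increment_layer advances every per-time-step buffer of a sub-layer by a whole number of steps; this is how the loops below walk one layer struct forward and backward through the sequence. A minimal sketch of the equivalent indexing, assuming buffers laid out as steps contiguous blocks of outputs*batch floats (the helper name step_output is hypothetical, not part of this file):

/* Output of time step t starts t*outputs*batch floats into the buffer,
 * exactly the offset that increment_layer(l, t) applies to l->output. */
static float *step_output(const layer *l, int t)
{
    return l->output + (size_t)t * l->outputs * l->batch;
}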
28 
29 layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
30 {
31  fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs);
32  batch = batch / steps;
33  layer l = { 0 };
34  l.batch = batch;
35  l.type = LSTM;
36  l.steps = steps;
37  l.inputs = inputs;
38 
39  l.uf = malloc(sizeof(layer));
40  fprintf(stderr, "\t\t");
41  *(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
42  l.uf->batch = batch;
43 
44  l.ui = malloc(sizeof(layer));
45  fprintf(stderr, "\t\t");
46  *(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
47  l.ui->batch = batch;
48 
49  l.ug = malloc(sizeof(layer));
50  fprintf(stderr, "\t\t");
51  *(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
52  l.ug->batch = batch;
53 
54  l.uo = malloc(sizeof(layer));
55  fprintf(stderr, "\t\t");
56  *(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
57  l.uo->batch = batch;
58 
59  l.wf = malloc(sizeof(layer));
60  fprintf(stderr, "\t\t");
61  *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
62  l.wf->batch = batch;
63 
64  l.wi = malloc(sizeof(layer));
65  fprintf(stderr, "\t\t");
66  *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
67  l.wi->batch = batch;
68 
69  l.wg = malloc(sizeof(layer));
70  fprintf(stderr, "\t\t");
71  *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
72  l.wg->batch = batch;
73 
74  l.wo = malloc(sizeof(layer));
75  fprintf(stderr, "\t\t");
76  *(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
77  l.wo->batch = batch;
78 
79  l.batch_normalize = batch_normalize;
80  l.outputs = outputs;
81 
82  l.output = calloc(outputs*batch*steps, sizeof(float));
83  l.state = calloc(outputs*batch, sizeof(float));
84 
85  l.forward = forward_lstm_layer;
86  l.backward = backward_lstm_layer;
87  l.update = update_lstm_layer;
88  l.prev_state_cpu = calloc(batch*outputs, sizeof(float));
89  l.prev_cell_cpu = calloc(batch*outputs, sizeof(float));
90  l.cell_cpu = calloc(batch*outputs*steps, sizeof(float));
91 
92  l.f_cpu = calloc(batch*outputs, sizeof(float));
93  l.i_cpu = calloc(batch*outputs, sizeof(float));
94  l.g_cpu = calloc(batch*outputs, sizeof(float));
95  l.o_cpu = calloc(batch*outputs, sizeof(float));
96  l.c_cpu = calloc(batch*outputs, sizeof(float));
97  l.h_cpu = calloc(batch*outputs, sizeof(float));
98  l.temp_cpu = calloc(batch*outputs, sizeof(float));
99  l.temp2_cpu = calloc(batch*outputs, sizeof(float));
100  l.temp3_cpu = calloc(batch*outputs, sizeof(float));
101  l.dc_cpu = calloc(batch*outputs, sizeof(float));
102  l.dh_cpu = calloc(batch*outputs, sizeof(float));
103 
104 #ifdef GPU
105  l.forward_gpu = forward_lstm_layer_gpu;
106  l.backward_gpu = backward_lstm_layer_gpu;
107  l.update_gpu = update_lstm_layer_gpu;
108 
109  l.output_gpu = cuda_make_array(0, batch*outputs*steps);
110  l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
111 
112  l.prev_state_gpu = cuda_make_array(0, batch*outputs);
113  l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
114  l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
115 
116  l.f_gpu = cuda_make_array(0, batch*outputs);
117  l.i_gpu = cuda_make_array(0, batch*outputs);
118  l.g_gpu = cuda_make_array(0, batch*outputs);
119  l.o_gpu = cuda_make_array(0, batch*outputs);
120  l.c_gpu = cuda_make_array(0, batch*outputs);
121  l.h_gpu = cuda_make_array(0, batch*outputs);
122  l.temp_gpu = cuda_make_array(0, batch*outputs);
123  l.temp2_gpu = cuda_make_array(0, batch*outputs);
124  l.temp3_gpu = cuda_make_array(0, batch*outputs);
125  l.dc_gpu = cuda_make_array(0, batch*outputs);
126  l.dh_gpu = cuda_make_array(0, batch*outputs);
127 #ifdef CUDNN
128  cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w);
129  cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w);
130  cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w);
131  cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w);
132 
133  cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w);
134  cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w);
135  cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w);
136  cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w);
137 #endif
138 
139 #endif
140 
141  return l;
142 }
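
A minimal construction sketch with hypothetical sizes (not taken from this file): the batch argument is the total number of rows across all time steps, since make_lstm_layer divides it by steps before sizing the per-step state.

/* Hypothetical example: 4 sequences unrolled over 8 time steps,
 * 256 inputs, 512 hidden units, no batch norm, no Adam.
 * 32 total rows / 8 steps = 4 rows per step inside the layer. */
layer l = make_lstm_layer(32, 256, 512, 8, 0, 0);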
143 
144 void update_lstm_layer(layer l, update_args a)
145 {
146  update_connected_layer(*(l.wf), a);
147  update_connected_layer(*(l.wi), a);
148  update_connected_layer(*(l.wg), a);
149  update_connected_layer(*(l.wo), a);
150  update_connected_layer(*(l.uf), a);
151  update_connected_layer(*(l.ui), a);
152  update_connected_layer(*(l.ug), a);
153  update_connected_layer(*(l.uo), a);
154 }
155 
156 void forward_lstm_layer(layer l, network state)
157 {
158  network s = { 0 };
159  s.train = state.train;
160  int i;
161  layer wf = *(l.wf);
162  layer wi = *(l.wi);
163  layer wg = *(l.wg);
164  layer wo = *(l.wo);
165 
166  layer uf = *(l.uf);
167  layer ui = *(l.ui);
168  layer ug = *(l.ug);
169  layer uo = *(l.uo);
170 
171  fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
172  fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
173  fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
174  fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
175 
176  fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
177  fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
178  fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
179  fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
180  if (state.train) {
181  fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
182  }
183 
184  for (i = 0; i < l.steps; ++i) {
185  s.input = l.h_cpu;
186  forward_connected_layer(wf, s);
187  forward_connected_layer(wi, s);
188  forward_connected_layer(wg, s);
189  forward_connected_layer(wo, s);
190 
191  s.input = state.input;
192  forward_connected_layer(uf, s);
193  forward_connected_layer(ui, s);
194  forward_connected_layer(ug, s);
195  forward_connected_layer(uo, s);
196 
197  copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
198  axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
199 
200  copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
201  axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
202 
203  copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
204  axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
205 
206  copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
207  axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
208 
208 
209  activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
210  activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
211  activate_array(l.g_cpu, l.outputs*l.batch, TANH);
212  activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
213 
214  copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
215  mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
216  mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1);
217  axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);
218 
219  copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1);
220  activate_array(l.h_cpu, l.outputs*l.batch, TANH);
221  mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1);
222 
223  copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1);
224  copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1);
225 
226  state.input += l.inputs*l.batch;
227  l.output += l.outputs*l.batch;
228  l.cell_cpu += l.outputs*l.batch;
229 
230  increment_layer(&wf, 1);
231  increment_layer(&wi, 1);
232  increment_layer(&wg, 1);
233  increment_layer(&wo, 1);
234 
235  increment_layer(&uf, 1);
236  increment_layer(&ui, 1);
237  increment_layer(&ug, 1);
238  increment_layer(&uo, 1);
239  }
240 }
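
For reference, each iteration of the loop above implements the standard LSTM update, where the W* connected layers consume the previous hidden state h_{t-1}, the U* layers consume the input x_t, \sigma is the logistic function, and \circ is element-wise multiplication (biases live inside the connected layers):

f_t = \sigma(W_f h_{t-1} + U_f x_t)
i_t = \sigma(W_i h_{t-1} + U_i x_t)
g_t = \tanh(W_g h_{t-1} + U_g x_t)
o_t = \sigma(W_o h_{t-1} + U_o x_t)
c_t = f_t \circ c_{t-1} + i_t \circ g_t
h_t = o_t \circ \tanh(c_t)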
241 
242 void backward_lstm_layer(layer l, network state)
243 {
244  network s = { 0 };
245  s.train = state.train;
246  int i;
247  layer wf = *(l.wf);
248  layer wi = *(l.wi);
249  layer wg = *(l.wg);
250  layer wo = *(l.wo);
251 
252  layer uf = *(l.uf);
253  layer ui = *(l.ui);
254  layer ug = *(l.ug);
255  layer uo = *(l.uo);
256 
257  increment_layer(&wf, l.steps - 1);
258  increment_layer(&wi, l.steps - 1);
259  increment_layer(&wg, l.steps - 1);
260  increment_layer(&wo, l.steps - 1);
261 
262  increment_layer(&uf, l.steps - 1);
263  increment_layer(&ui, l.steps - 1);
264  increment_layer(&ug, l.steps - 1);
265  increment_layer(&uo, l.steps - 1);
266 
267  state.input += l.inputs*l.batch*(l.steps - 1);
268  if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
269 
270  l.output += l.outputs*l.batch*(l.steps - 1);
271  l.cell_cpu += l.outputs*l.batch*(l.steps - 1);
272  l.delta += l.outputs*l.batch*(l.steps - 1);
273 
274  for (i = l.steps - 1; i >= 0; --i) {
275  if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1);
276  copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1);
277  if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1);
278  copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1);
279 
280  l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch;
281 
282  copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
283  axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
284 
285  copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
286  axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
287 
288  copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
289  axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
290 
291  copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
292  axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
293 
293 
294  activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
295  activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
296  activate_array(l.g_cpu, l.outputs*l.batch, TANH);
297  activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
298 
299  copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1);
300 
301  copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
302  activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
303 
304  copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);
305  mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1);
306 
307  gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu);
308  axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);
309 
310  copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
311  activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
312  mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);
313  gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
314  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1);
315  s.input = l.prev_state_cpu;
316  s.delta = l.dh_cpu;
317  backward_connected_layer(wo, s);
318 
319  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1);
320  s.input = state.input;
321  s.delta = state.delta;
322  backward_connected_layer(uo, s);
323 
324  copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
325  mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
326  gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu);
327  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1);
328  s.input = l.prev_state_cpu;
329  s.delta = l.dh_cpu;
330  backward_connected_layer(wg, s);
331 
332  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1);
333  s.input = state.input;
334  s.delta = state.delta;
335  backward_connected_layer(ug, s);
336 
337  copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
338  mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
339  gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
340  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1);
341  s.input = l.prev_state_cpu;
342  s.delta = l.dh_cpu;
343  backward_connected_layer(wi, s);
344 
345  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1);
346  s.input = state.input;
347  s.delta = state.delta;
348  backward_connected_layer(ui, s);
349 
350  copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
351  mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
352  gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
353  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1);
354  s.input = l.prev_state_cpu;
355  s.delta = l.dh_cpu;
356  backward_connected_layer(wf, s);
357 
358  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1);
359  s.input = state.input;
360  s.delta = state.delta;
361  backward_connected_layer(uf, s);
362 
363  copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
364  mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1);
365  copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1);
366 
367  state.input -= l.inputs*l.batch;
368  if (state.delta) state.delta -= l.inputs*l.batch;
369  l.output -= l.outputs*l.batch;
370  l.cell_cpu -= l.outputs*l.batch;
371  l.delta -= l.outputs*l.batch;
372 
373  increment_layer(&wf, -1);
374  increment_layer(&wi, -1);
375  increment_layer(&wg, -1);
376  increment_layer(&wo, -1);
377 
378  increment_layer(&uf, -1);
379  increment_layer(&ui, -1);
380  increment_layer(&ug, -1);
381  increment_layer(&uo, -1);
382  }
383 }
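
Written out in the same notation as the forward pass, each reverse-time iteration above computes the usual LSTM gradients (a sketch; \delta h_t is the incoming delta for step t, and the term f_{t+1} \circ \delta c_{t+1} is what dc_cpu carries between iterations):

\delta c_t = \delta h_t \circ o_t \circ (1 - \tanh^2 c_t) + f_{t+1} \circ \delta c_{t+1}
\delta o_t^{pre} = \delta h_t \circ \tanh(c_t) \circ o_t (1 - o_t)    (written to wo.delta and uo.delta)
\delta g_t^{pre} = \delta c_t \circ i_t \circ (1 - g_t^2)             (wg.delta, ug.delta)
\delta i_t^{pre} = \delta c_t \circ g_t \circ i_t (1 - i_t)           (wi.delta, ui.delta)
\delta f_t^{pre} = \delta c_t \circ c_{t-1} \circ f_t (1 - f_t)       (wf.delta, uf.delta)

The pre-activation deltas are then pushed through the corresponding connected layers to accumulate the weight gradients and the gradients with respect to h_{t-1} and x_t.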
384 
385 #ifdef GPU
386 void update_lstm_layer_gpu(layer l, update_args a)
387 {
388  update_connected_layer_gpu(*(l.wf), a);
389  update_connected_layer_gpu(*(l.wi), a);
390  update_connected_layer_gpu(*(l.wg), a);
391  update_connected_layer_gpu(*(l.wo), a);
392  update_connected_layer_gpu(*(l.uf), a);
393  update_connected_layer_gpu(*(l.ui), a);
394  update_connected_layer_gpu(*(l.ug), a);
395  update_connected_layer_gpu(*(l.uo), a);
396 }
397 
398 void forward_lstm_layer_gpu(layer l, network state)
399 {
400  network s = { 0 };
401  s.train = state.train;
402  int i;
403  layer wf = *(l.wf);
404  layer wi = *(l.wi);
405  layer wg = *(l.wg);
406  layer wo = *(l.wo);
407 
408  layer uf = *(l.uf);
409  layer ui = *(l.ui);
410  layer ug = *(l.ug);
411  layer uo = *(l.uo);
412 
413  fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
414  fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
415  fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
416  fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
417 
418  fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
419  fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
420  fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
421  fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
422  if (state.train) {
423  fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
424  }
425 
426  for (i = 0; i < l.steps; ++i) {
427  s.input_gpu = l.h_gpu;
428  forward_connected_layer_gpu(wf, s);
429  forward_connected_layer_gpu(wi, s);
430  forward_connected_layer_gpu(wg, s);
431  forward_connected_layer_gpu(wo, s);
432 
433  s.input_gpu = state.input_gpu;
434  forward_connected_layer_gpu(uf, s);
435  forward_connected_layer_gpu(ui, s);
436  forward_connected_layer_gpu(ug, s);
437  forward_connected_layer_gpu(uo, s);
438 
439  copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
440  axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
441 
442  copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
443  axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
444 
445  copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
446  axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
447 
448  copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
449  axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
450 
451  activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
452  activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
453  activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);
454  activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
455 
456  copy_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
457  mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
458  mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
459  axpy_gpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
460 
461  copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
462  activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
463  mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
464 
465  copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
466  copy_gpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
467 
468  state.input_gpu += l.inputs*l.batch;
469  l.output_gpu += l.outputs*l.batch;
470  l.cell_gpu += l.outputs*l.batch;
471 
472  increment_layer(&wf, 1);
473  increment_layer(&wi, 1);
474  increment_layer(&wg, 1);
475  increment_layer(&wo, 1);
476 
477  increment_layer(&uf, 1);
478  increment_layer(&ui, 1);
479  increment_layer(&ug, 1);
480  increment_layer(&uo, 1);
481  }
482 }
483 
484 void backward_lstm_layer_gpu(layer l, network state)
485 {
486  network s = { 0 };
487  s.train = state.train;
488  int i;
489  layer wf = *(l.wf);
490  layer wi = *(l.wi);
491  layer wg = *(l.wg);
492  layer wo = *(l.wo);
493 
494  layer uf = *(l.uf);
495  layer ui = *(l.ui);
496  layer ug = *(l.ug);
497  layer uo = *(l.uo);
498 
499  increment_layer(&wf, l.steps - 1);
500  increment_layer(&wi, l.steps - 1);
501  increment_layer(&wg, l.steps - 1);
502  increment_layer(&wo, l.steps - 1);
503 
504  increment_layer(&uf, l.steps - 1);
505  increment_layer(&ui, l.steps - 1);
506  increment_layer(&ug, l.steps - 1);
507  increment_layer(&uo, l.steps - 1);
508 
509  state.input_gpu += l.inputs*l.batch*(l.steps - 1);
510  if (state.delta_gpu) state.delta_gpu += l.inputs*l.batch*(l.steps - 1);
511 
512  l.output_gpu += l.outputs*l.batch*(l.steps - 1);
513  l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
514  l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
515 
516  for (i = l.steps - 1; i >= 0; --i) {
517  if (i != 0) copy_gpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
518  copy_gpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
519  if (i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
520  copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
521 
522  l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
523 
524  copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
525  axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
526 
527  copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
528  axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
529 
530  copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
531  axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
532 
533  copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
534  axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
535 
536  activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
537  activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
538  activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);
539  activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
540 
541  copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
542 
543  copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
544  activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);
545 
546  copy_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
547  mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);
548 
549  gradient_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
550  axpy_gpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);
551 
552  copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
553  activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);
554  mul_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
555  gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
556  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
557  s.input_gpu = l.prev_state_gpu;
558  s.delta_gpu = l.dh_gpu;
559  backward_connected_layer_gpu(wo, s);
560 
561  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
562  s.input_gpu = state.input_gpu;
563  s.delta_gpu = state.delta_gpu;
564  backward_connected_layer_gpu(uo, s);
565 
566  copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
567  mul_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
568  gradient_array_gpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
569  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
570  s.input_gpu = l.prev_state_gpu;
571  s.delta_gpu = l.dh_gpu;
572  backward_connected_layer_gpu(wg, s);
573 
574  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
575  s.input_gpu = state.input_gpu;
576  s.delta_gpu = state.delta_gpu;
577  backward_connected_layer_gpu(ug, s);
578 
579  copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
580  mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
581  gradient_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
582  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
583  s.input_gpu = l.prev_state_gpu;
584  s.delta_gpu = l.dh_gpu;
585  backward_connected_layer_gpu(wi, s);
586 
587  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
588  s.input_gpu = state.input_gpu;
589  s.delta_gpu = state.delta_gpu;
590  backward_connected_layer_gpu(ui, s);
591 
592  copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
593  mul_gpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
594  gradient_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
595  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
596  s.input_gpu = l.prev_state_gpu;
597  s.delta_gpu = l.dh_gpu;
598  backward_connected_layer_gpu(wf, s);
599 
600  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
601  s.input_gpu = state.input_gpu;
602  s.delta_gpu = state.delta_gpu;
603  backward_connected_layer_gpu(uf, s);
604 
605  copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
606  mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
607  copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
608 
609  state.input_gpu -= l.inputs*l.batch;
610  if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;
611  l.output_gpu -= l.outputs*l.batch;
612  l.cell_gpu -= l.outputs*l.batch;
613  l.delta_gpu -= l.outputs*l.batch;
614 
615  increment_layer(&wf, -1);
616  increment_layer(&wi, -1);
617  increment_layer(&wg, -1);
618  increment_layer(&wo, -1);
619 
620  increment_layer(&uf, -1);
621  increment_layer(&ui, -1);
622  increment_layer(&ug, -1);
623  increment_layer(&uo, -1);
624  }
625 }
626 #endif