static void increment_layer(layer *l, int steps)
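/*
 * The body of increment_layer is not part of this excerpt. A minimal sketch of
 * what it is presumably for, assuming the Darknet-style layout where each gate
 * sub-layer's buffers hold l->steps contiguous slices of l->outputs*l->batch
 * floats: stepping the pointers selects a time slice (negative steps walk
 * backwards). The struct below is a simplified stand-in, not the real layer type.
 */
typedef struct toy_layer {
    int batch, outputs;
    float *output;   /* activations, laid out step by step */
    float *delta;    /* gradients, same layout             */
} toy_layer;

static void toy_increment_layer(toy_layer *l, int steps)
{
    int num = l->outputs * l->batch * steps;   /* size of `steps` time slices */
    l->output += num;                          /* advance to that slice       */
    l->delta  += num;
}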
fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs);
batch = batch / steps;   /* the incoming batch counts every time step; divide to get the per-step batch */
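/*
 * Between this point and the buffer allocations below, make_lstm_layer
 * presumably constructs the eight fully connected gate sub-layers (wf, wi, wg,
 * wo driven by the recurrent state, uf, ui, ug, uo driven by the layer input)
 * with make_connected_layer, whose prototype is listed at the end of this
 * excerpt. The indentation fprintf calls kept below accompany each
 * construction. A hedged sketch of one such pair (the LINEAR activation and
 * the per-sub-layer batch fix-up are assumptions, not shown in the excerpt):
 */
l.uf = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.uf->batch = batch;

l.wf = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wf->batch = batch;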
/* indentation prefix printed as each of the eight gate sub-layers is announced */
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
fprintf(stderr, "\t\t");
l.output = calloc(outputs*batch*steps, sizeof(float));
l.state = calloc(outputs*batch, sizeof(float));
l.cell_cpu = calloc(batch*outputs*steps, sizeof(float));
l.f_cpu = calloc(batch*outputs, sizeof(float));
l.i_cpu = calloc(batch*outputs, sizeof(float));
l.g_cpu = calloc(batch*outputs, sizeof(float));
l.o_cpu = calloc(batch*outputs, sizeof(float));
l.c_cpu = calloc(batch*outputs, sizeof(float));
l.h_cpu = calloc(batch*outputs, sizeof(float));
l.temp_cpu = calloc(batch*outputs, sizeof(float));
l.temp2_cpu = calloc(batch*outputs, sizeof(float));
l.temp3_cpu = calloc(batch*outputs, sizeof(float));
l.dc_cpu = calloc(batch*outputs, sizeof(float));
l.dh_cpu = calloc(batch*outputs, sizeof(float));
l.output_gpu = cuda_make_array(0, batch*outputs*steps);
l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
l.f_gpu = cuda_make_array(0, batch*outputs);
l.i_gpu = cuda_make_array(0, batch*outputs);
l.g_gpu = cuda_make_array(0, batch*outputs);
l.o_gpu = cuda_make_array(0, batch*outputs);
l.c_gpu = cuda_make_array(0, batch*outputs);
l.h_gpu = cuda_make_array(0, batch*outputs);
l.temp_gpu = cuda_make_array(0, batch*outputs);
l.temp2_gpu = cuda_make_array(0, batch*outputs);
l.temp3_gpu = cuda_make_array(0, batch*outputs);
l.dc_gpu = cuda_make_array(0, batch*outputs);
l.dh_gpu = cuda_make_array(0, batch*outputs);
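/*
 * The GPU buffers above mirror the CPU ones. Assuming cuda_make_array(0, n)
 * allocates n floats on the device and clears them when no host source is
 * given, each call amounts to something like the sketch below (illustration
 * only, error handling omitted; not the actual Darknet implementation):
 */
#include <cuda_runtime.h>

static float *make_zeroed_device_array(size_t n)
{
    float *x_gpu = 0;
    cudaMalloc((void **)&x_gpu, n * sizeof(float));   /* device allocation   */
    cudaMemset(x_gpu, 0, n * sizeof(float));          /* zero-initialization */
    return x_gpu;
}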
cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w);
cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w);
cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w);
cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w);
cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w);
cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w);
cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w);
cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w);
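/*
 * The eight descriptor calls above differ only in which sub-layer they touch:
 * each gate's output descriptor is set to NCHW float with the per-step batch
 * size. An equivalent, less repetitive form (a sketch only; behavior unchanged):
 */
layer *gates[8] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
int gi;
for (gi = 0; gi < 8; ++gi) {
    cudnnSetTensor4dDescriptor(gates[gi]->dstTensorDesc,
                               CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                               batch,
                               gates[gi]->out_c, gates[gi]->out_h, gates[gi]->out_w);
}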
/* forward pass: after each time step, advance every gate sub-layer to the next time slice */
for (i = 0; i < l.steps; ++i) {
    increment_layer(&wf, 1);
    increment_layer(&wi, 1);
    increment_layer(&wg, 1);
    increment_layer(&wo, 1);

    increment_layer(&uf, 1);
    increment_layer(&ui, 1);
    increment_layer(&ug, 1);
    increment_layer(&uo, 1);
/* backward pass: seek every gate sub-layer to the last time slice, then walk the steps in reverse */
increment_layer(&wf, l.steps - 1);
increment_layer(&wi, l.steps - 1);
increment_layer(&wg, l.steps - 1);
increment_layer(&wo, l.steps - 1);

increment_layer(&uf, l.steps - 1);
increment_layer(&ui, l.steps - 1);
increment_layer(&ug, l.steps - 1);
increment_layer(&uo, l.steps - 1);

for (i = l.steps - 1; i >= 0; --i) {
    increment_layer(&wf, -1);
    increment_layer(&wi, -1);
    increment_layer(&wg, -1);
    increment_layer(&wo, -1);

    increment_layer(&uf, -1);
    increment_layer(&ui, -1);
    increment_layer(&ug, -1);
    increment_layer(&uo, -1);
update_connected_layer_gpu(*(l.wf), a);
update_connected_layer_gpu(*(l.wi), a);
update_connected_layer_gpu(*(l.wg), a);
update_connected_layer_gpu(*(l.wo), a);
update_connected_layer_gpu(*(l.uf), a);
update_connected_layer_gpu(*(l.ui), a);
update_connected_layer_gpu(*(l.ug), a);
update_connected_layer_gpu(*(l.uo), a);
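/*
 * These eight calls presumably form the body of the GPU weight update:
 * updating the LSTM layer is just updating its gate sub-layers. The CPU
 * counterpart listed in the index below, update_lstm_layer, would have the
 * same shape (a sketch, delegating to update_connected_layer):
 */
void update_lstm_layer(layer l, update_args a)
{
    update_connected_layer(*(l.wf), a);
    update_connected_layer(*(l.wi), a);
    update_connected_layer(*(l.wg), a);
    update_connected_layer(*(l.wo), a);
    update_connected_layer(*(l.uf), a);
    update_connected_layer(*(l.ui), a);
    update_connected_layer(*(l.ug), a);
    update_connected_layer(*(l.uo), a);
}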
for (i = 0; i < l.steps; ++i) {
    s.input_gpu = l.h_gpu;                  /* recurrent path: W* gates read the previous hidden state */
    forward_connected_layer_gpu(wf, s);
    forward_connected_layer_gpu(wi, s);
    forward_connected_layer_gpu(wg, s);
    forward_connected_layer_gpu(wo, s);

    s.input_gpu = state.input_gpu;          /* input path: U* gates read the current layer input */
    forward_connected_layer_gpu(uf, s);
    forward_connected_layer_gpu(ui, s);
    forward_connected_layer_gpu(ug, s);
    forward_connected_layer_gpu(uo, s);
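    /*
     * After the connected layers have produced per-gate pre-activations (W*
     * from the recurrent state h, U* from the current input), the step
     * presumably sums and squashes them using the GPU helpers listed in the
     * index. A hedged sketch for the forget gate alone (the other gates would
     * follow the same pattern, with TANH for the cell candidate g):
     */
    copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);      /* f  = Wf*h       */
    axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);   /* f += Uf*x       */
    activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);       /* f  = sigmoid(f) */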
    increment_layer(&wf, 1);
    increment_layer(&wi, 1);
    increment_layer(&wg, 1);
    increment_layer(&wo, 1);

    increment_layer(&uf, 1);
    increment_layer(&ui, 1);
    increment_layer(&ug, 1);
    increment_layer(&uo, 1);
increment_layer(&wf, l.steps - 1);
increment_layer(&wi, l.steps - 1);
increment_layer(&wg, l.steps - 1);
increment_layer(&wo, l.steps - 1);

increment_layer(&uf, l.steps - 1);
increment_layer(&ui, l.steps - 1);
increment_layer(&ug, l.steps - 1);
increment_layer(&uo, l.steps - 1);

for (i = l.steps - 1; i >= 0; --i) {
    l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
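    /*
     * Worked example of the pointer arithmetic above (illustrative numbers
     * only): increment_layer walks the buffers in slices of outputs*batch
     * floats per time step, so the previous step's delta sits exactly one
     * slice before the current one; at i == 0 there is no previous step.
     */
    int slice = l.outputs*l.batch;             /* e.g. 256 outputs * 4 batch = 1024 floats       */
    float *delta_prev = l.delta_gpu - slice;   /* previous step's hidden-state delta (i > 0 only) */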
    /* output gate: wo back-propagates into the previous hidden state, uo into the layer input */
    s.input_gpu = l.prev_state_gpu;
    s.delta_gpu = l.dh_gpu;
    backward_connected_layer_gpu(wo, s);

    s.input_gpu = state.input_gpu;
    s.delta_gpu = state.delta_gpu;
    backward_connected_layer_gpu(uo, s);
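    /*
     * Before the two calls above, the shared delta for the output gate is
     * presumably staged through the temp buffers using the helpers listed in
     * the index: d_o = dL/dh * tanh(c) * sigmoid'(o), then copied into both
     * sub-layers' delta. A hedged sketch:
     */
    copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
    activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);                /* tanh(c)         */
    copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp2_gpu, 1);
    mul_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.temp2_gpu, 1);              /* dL/dh * tanh(c) */
    gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp2_gpu);  /* * sigmoid'(o)   */
    copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, wo.delta_gpu, 1);
    copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, uo.delta_gpu, 1);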
    /* cell candidate */
    s.input_gpu = l.prev_state_gpu;
    s.delta_gpu = l.dh_gpu;
    backward_connected_layer_gpu(wg, s);

    s.input_gpu = state.input_gpu;
    s.delta_gpu = state.delta_gpu;
    backward_connected_layer_gpu(ug, s);

    /* input gate */
    s.input_gpu = l.prev_state_gpu;
    s.delta_gpu = l.dh_gpu;
    backward_connected_layer_gpu(wi, s);

    s.input_gpu = state.input_gpu;
    s.delta_gpu = state.delta_gpu;
    backward_connected_layer_gpu(ui, s);

    /* forget gate */
    s.input_gpu = l.prev_state_gpu;
    s.delta_gpu = l.dh_gpu;
    backward_connected_layer_gpu(wf, s);

    s.input_gpu = state.input_gpu;
    s.delta_gpu = state.delta_gpu;
    backward_connected_layer_gpu(uf, s);
    if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;   /* rewind the input delta to the previous time step */
    increment_layer(&wf, -1);
    increment_layer(&wi, -1);
    increment_layer(&wg, -1);
    increment_layer(&wo, -1);

    increment_layer(&uf, -1);
    increment_layer(&ui, -1);
    increment_layer(&ug, -1);
    increment_layer(&uo, -1);
void update_connected_layer(layer l, update_args a)
void(* update)(struct layer, update_args)
void(* forward_gpu)(struct layer, struct network)
void(* backward_gpu)(struct layer, struct network)
void axpy_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
void(* update_gpu)(struct layer, update_args)
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY)
void(* forward)(struct layer, struct network)
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta)
void mul_gpu(int N, float *X, int INCX, float *Y, int INCY)
void fill_gpu(int N, float ALPHA, float *X, int INCX)
void forward_connected_layer(layer l, network net)
void fill_cpu(int N, float ALPHA, float *X, int INCX)
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
void copy_gpu(int N, float *X, int INCX, float *Y, int INCY)
void update_lstm_layer(layer l, update_args a)
void activate_array(float *x, const int n, const ACTIVATION a)
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
void backward_lstm_layer(layer l, network state)
void activate_array_gpu(float *x, int n, ACTIVATION a)
layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
void forward_lstm_layer(layer l, network state)
void backward_connected_layer(layer l, network net)
void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)