1 // nnet3/nnet-simple-component.h
3 // Copyright 2011-2013 Karel Vesely
4 // 2012-2015 Johns Hopkins University (author: Daniel Povey)
5 // 2013 Xiaohui Zhang
6 // 2014-2015 Vijayaditya Peddinti
7 // 2014-2015 Guoguo Chen
8 // 2015 Daniel Galvez
9 // 2015 Tom Ko
11 // See ../../COPYING for clarification regarding multiple authors
12 //
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 //
17 // http://www.apache.org/licenses/LICENSE-2.0
18 //
19 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
20 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
21 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
22 // MERCHANTABLITY OR NON-INFRINGEMENT.
23 // See the Apache 2 License for the specific language governing permissions and
24 // limitations under the License.
26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
29 #include "nnet3/nnet-common.h"
30 #include "nnet3/nnet-component-itf.h"
31 #include "nnet3/natural-gradient-online.h"
32 #include <iostream>
34 namespace kaldi {
35 namespace nnet3 {
37 /// @file nnet-simple-component.h
38 /// This file contains declarations of components that are "simple", meaning
39 /// they don't care about the indexes they are operating on, produce one
40 /// output for one input, and return the kSimpleComponent flag in their
41 /// Properties(): for example, tanh and affine components. In
42 /// nnet-general-component.h there are components that don't fit this pattern.
44 // This "nnet3" version of the p-norm component only supports the 2-norm.
// PnormComponent groups its input into blocks and outputs one value per
// block; per the file-level comment above, this nnet3 version supports only
// the 2-norm.  It has no trainable parameters.
class PnormComponent: public Component {
 public:
  // Sets the input and output dimensions.  Presumably output_dim must divide
  // input_dim (one output per group of inputs) -- confirm against the .cc.
  void Init(int32 input_dim, int32 output_dim);
  explicit PnormComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    // Backprop reads both the input and output values (see the in_value /
    // out_value arguments of Backprop below).
    return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
  }
  PnormComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "PnormComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new PnormComponent(input_dim_,
                                                              output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
83 // This component randomly zeros dropout_proportion of the input
84 // and the derivatives are backpropagated through the nonzero inputs.
// Typically this component is used during training but not at test time.
86 // The idea is described under the name Dropout, in the paper
87 // "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
// See the comment just above for the description of this component; it zeroes
// a random proportion (dropout_proportion_) of its input.  Input and output
// dimensions are equal (dim_); there are no trainable parameters.
class DropoutComponent : public RandomComponent {
 public:
  // Sets the dimension and the proportion of inputs to zero out.
  void Init(int32 dim, BaseFloat dropout_proportion = 0.0);

  DropoutComponent(int32 dim, BaseFloat dropout = 0.0) { Init(dim, dropout); }

  DropoutComponent(): dim_(0), dropout_proportion_(0.0) { }

  virtual int32 Properties() const {
    // Backprop reads both in_value and out_value (see Backprop below), and
    // may be done in place.
    return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
  }
  virtual std::string Type() const { return "DropoutComponent"; }

  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 InputDim() const { return dim_; }

  virtual int32 OutputDim() const { return dim_; }

  virtual void Read(std::istream &is, bool binary);

  // Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new DropoutComponent(dim_,
                                                                dropout_proportion_); }
  virtual std::string Info() const;

  // Allows the dropout proportion to be changed after construction, e.g. by
  // training code implementing a dropout schedule.
  void SetDropoutProportion(BaseFloat dropout_proportion) { dropout_proportion_ = dropout_proportion; }

 private:
  int32 dim_;
  /// dropout-proportion is the proportion that is dropped out,
  /// e.g. if 0.1, we set 10% to zero value.
  BaseFloat dropout_proportion_;
};
// ElementwiseProductComponent: a fixed (non-trainable) component.  The name
// and the distinct input/output dims suggest it multiplies together several
// equal-sized blocks of the input elementwise -- confirm against the .cc.
class ElementwiseProductComponent: public Component {
 public:
  // Sets the input and output dimensions.
  void Init(int32 input_dim, int32 output_dim);
  explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    // Not linear in its input (it is a product), and backprop needs the
    // input values.
    return kSimpleComponent|kBackpropNeedsInput;
  }
  ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "ElementwiseProductComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
                                                                           output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
// NormalizeComponent scales each row of its input so the row has a target
// root-mean-square value (target_rms_); optionally (add_log_stddev_) it
// appends one extra output dimension containing the log of the row's stddev
// (see the comment on add_log_stddev_ below).  Non-trainable.
class NormalizeComponent: public Component {
 public:
  void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
  explicit NormalizeComponent(int32 input_dim,
                              BaseFloat target_rms = 1.0,
                              bool add_log_stddev = false) {
    Init(input_dim, target_rms, add_log_stddev);
  }
  explicit NormalizeComponent(const NormalizeComponent &other);
  // note: there is some special code in NonlinearComponent::Info() that
  // specifically caters to this class.
  virtual int32 Properties() const {
    // When add_log_stddev_ is true the output is one dimension wider than the
    // input (see OutputDim() below), so in-place propagate/backprop is
    // impossible; otherwise it is allowed.
    return (add_log_stddev_ ?
            kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
            kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
            kBackpropAdds|kBackpropInPlace);
  }
  NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
  virtual std::string Type() const { return "NormalizeComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual Component* Copy() const { return new NormalizeComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const {
    // One extra output dimension carries the log-stddev if requested.
    return (input_dim_ + (add_log_stddev_ ? 1 : 0));
  }
  virtual std::string Info() const;
 private:
  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
  // kExpSquaredNormFloor is the log of kSquaredNormFloor below.
  enum { kExpSquaredNormFloor = -66 };
  // kSquaredNormFloor is about 0.7e-20.  We need a value that's exactly
  // representable in float and whose inverse square root is also exactly
  // representable in float (hence, an even power of two).
  static const BaseFloat kSquaredNormFloor;
  int32 input_dim_;
  BaseFloat target_rms_; // The target rms for outputs.
  bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
  // is an extra dimension of the output.
};
// SigmoidComponent: the sigmoid nonlinearity.  Dimension and stats storage
// are inherited from NonlinearComponent.  No trainable parameters.
class SigmoidComponent: public NonlinearComponent {
 public:
  explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
  SigmoidComponent() { }
  virtual std::string Type() const { return "SigmoidComponent"; }
  virtual int32 Properties() const {
    // Backprop needs only the output value (in_value is unused below), and
    // activation stats are accumulated via StoreStats().
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
  }
  virtual Component* Copy() const { return new SigmoidComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       SigmoidComponent *to_update) const;

  SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
};
// TanhComponent: the tanh nonlinearity.  Dimension and stats storage are
// inherited from NonlinearComponent.  No trainable parameters.
class TanhComponent: public NonlinearComponent {
 public:
  explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
  TanhComponent() { }
  virtual std::string Type() const { return "TanhComponent"; }
  virtual Component* Copy() const { return new TanhComponent(*this); }
  virtual int32 Properties() const {
    // Backprop needs only the output value (in_value is unused below), and
    // activation stats are accumulated via StoreStats().
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       TanhComponent *to_update) const;

  TanhComponent &operator = (const TanhComponent &other); // Disallow.
};
// RectifiedLinearComponent: the ReLU nonlinearity.  Dimension and stats
// storage are inherited from NonlinearComponent.  No trainable parameters.
class RectifiedLinearComponent: public NonlinearComponent {
 public:
  explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
      NonlinearComponent(other) { }
  RectifiedLinearComponent() { }
  virtual std::string Type() const { return "RectifiedLinearComponent"; }
  virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
  virtual int32 Properties() const {
    // kLinearInInput: ReLU is piecewise linear, scaling the input scales the
    // output.  Backprop needs only the output value (in_value is unused).
    return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
        kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
                       RectifiedLinearComponent *to_update) const;

  RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
};
321 /**
322 This component is a fixed (non-trainable) nonlinearity that sums its inputs
323 to produce outputs. Currently the only supported configuration is that its
324 input-dim is interpreted as consisting of n blocks, and the output is just a
325 summation over the n blocks, where n = input-dim / output-dim, so for instance
   output[i] = input[i] + input[block-size + i] + ..., where block-size == output-dim.
327 Later if needed we can add a configuration variable that allows you to sum
328 over 'interleaved' input.
329 */
// See the block comment just above for the description: sums over
// input-dim / output-dim consecutive blocks of the input.  Non-trainable.
class SumReduceComponent: public Component {
 public:
  // Sets the dimensions; per the comment above, output_dim must divide
  // input_dim.
  void Init(int32 input_dim, int32 output_dim);
  explicit SumReduceComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    // Summation is linear; backprop needs neither input nor output values
    // (both are unnamed in Backprop below).
    return kSimpleComponent|kLinearInInput;
  }
  SumReduceComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "SumReduceComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new SumReduceComponent(input_dim_,
                                                                  output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
369 class FixedAffineComponent;
370 class FixedScaleComponent;
371 class PerElementScaleComponent;
372 class PerElementOffsetComponent;
374 // Affine means a linear function plus an offset.
375 // Note: although this class can be instantiated, it also
376 // functions as a base-class for more specialized versions of
377 // AffineComponent.
378 class AffineComponent: public UpdatableComponent {
379 friend class SoftmaxComponent; // Friend declaration relates to mixing up.
380 public:
382 virtual int32 InputDim() const { return linear_params_.NumCols(); }
383 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
385 virtual std::string Info() const;
386 virtual void InitFromConfig(ConfigLine *cfl);
388 AffineComponent() { } // use Init to really initialize.
389 virtual std::string Type() const { return "AffineComponent"; }
390 virtual int32 Properties() const {
391 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
392 kBackpropNeedsInput|kBackpropAdds;
393 }
396 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
397 const CuMatrixBase<BaseFloat> &in,
398 CuMatrixBase<BaseFloat> *out) const;
399 virtual void Backprop(const std::string &debug_info,
400 const ComponentPrecomputedIndexes *indexes,
401 const CuMatrixBase<BaseFloat> &in_value,
402 const CuMatrixBase<BaseFloat> &, // out_value
403 const CuMatrixBase<BaseFloat> &out_deriv,
404 Component *to_update,
405 CuMatrixBase<BaseFloat> *in_deriv) const;
407 virtual void Read(std::istream &is, bool binary);
408 virtual void Write(std::ostream &os, bool binary) const;
410 virtual Component* Copy() const;
413 // Some functions from base-class UpdatableComponent.
414 virtual void Scale(BaseFloat scale);
415 virtual void Add(BaseFloat alpha, const Component &other);
416 virtual void PerturbParams(BaseFloat stddev);
417 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
418 virtual int32 NumParameters() const;
419 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
420 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
422 // Some functions that are specific to this class.
424 // This new function is used when mixing up:
425 virtual void SetParams(const VectorBase<BaseFloat> &bias,
426 const MatrixBase<BaseFloat> &linear);
427 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
428 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
429 explicit AffineComponent(const AffineComponent &other);
430 // The next constructor is used in converting from nnet1.
431 AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
432 const CuVectorBase<BaseFloat> &bias_params,
433 BaseFloat learning_rate);
434 void Init(int32 input_dim, int32 output_dim,
435 BaseFloat param_stddev, BaseFloat bias_stddev);
436 void Init(std::string matrix_filename);
438 // This function resizes the dimensions of the component, setting the
439 // parameters to zero, while leaving any other configuration values the same.
440 virtual void Resize(int32 input_dim, int32 output_dim);
442 // The following functions are used for collapsing multiple layers
443 // together. They return a pointer to a new Component equivalent to
444 // the sequence of two components. We haven't implemented this for
445 // FixedLinearComponent yet.
446 Component *CollapseWithNext(const AffineComponent &next) const ;
447 Component *CollapseWithNext(const FixedAffineComponent &next) const;
448 Component *CollapseWithNext(const FixedScaleComponent &next) const;
449 Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
451 protected:
452 friend class NaturalGradientAffineComponent;
453 // This function Update() is for extensibility; child classes may override
454 // this, e.g. for natural gradient update.
455 virtual void Update(
456 const std::string &debug_info,
457 const CuMatrixBase<BaseFloat> &in_value,
458 const CuMatrixBase<BaseFloat> &out_deriv) {
459 UpdateSimple(in_value, out_deriv);
460 }
461 // UpdateSimple is used when *this is a gradient. Child classes may override
462 // this if needed, but typically won't need to.
463 virtual void UpdateSimple(
464 const CuMatrixBase<BaseFloat> &in_value,
465 const CuMatrixBase<BaseFloat> &out_deriv);
467 const AffineComponent &operator = (const AffineComponent &other); // Disallow.
468 CuMatrix<BaseFloat> linear_params_;
469 CuVector<BaseFloat> bias_params_;
470 };
472 class RepeatedAffineComponent;
474 /// This class implements an affine transform using a block diagonal matrix
475 /// e.g., one whose weight matrix is all zeros except for blocks on the
476 /// diagonal. All these blocks have the same dimensions.
477 /// input-dim: num cols of block diagonal matrix.
478 /// output-dim: num rows of block diagonal matrix.
479 /// num-blocks: number of blocks in diagonal of the matrix.
480 /// num-blocks must divide both input-dim and output-dim
481 class BlockAffineComponent : public UpdatableComponent {
482 public:
483 virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
484 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
486 virtual std::string Info() const;
487 virtual void InitFromConfig(ConfigLine *cfl);
489 BlockAffineComponent() { }
490 virtual std::string Type() const { return "BlockAffineComponent"; }
491 virtual int32 Properties() const {
492 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
493 kBackpropNeedsInput|kBackpropAdds;
494 }
496 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
497 const CuMatrixBase<BaseFloat> &in,
498 CuMatrixBase<BaseFloat> *out) const;
500 virtual void Backprop(const std::string &debug_info,
501 const ComponentPrecomputedIndexes *indexes,
502 const CuMatrixBase<BaseFloat> &in_value,
503 const CuMatrixBase<BaseFloat> &, // out_value
504 const CuMatrixBase<BaseFloat> &out_deriv,
505 Component *to_update,
506 CuMatrixBase<BaseFloat> *in_deriv) const;
508 virtual void Read(std::istream &is, bool binary);
509 virtual void Write(std::ostream &os, bool binary) const;
511 virtual Component* Copy() const;
513 // Functions from base-class UpdatableComponent.
514 virtual void Scale(BaseFloat scale);
515 virtual void Add(BaseFloat alpha, const Component &other);
516 virtual void PerturbParams(BaseFloat stddev);
517 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
518 virtual int32 NumParameters() const;
519 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
520 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
522 // BlockAffine-specific functions.
523 void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
524 BaseFloat param_stddev, BaseFloat bias_mean,
525 BaseFloat bias_stddev);
526 explicit BlockAffineComponent(const BlockAffineComponent &other);
527 explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
528 protected:
529 // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
530 // equal size. The blocks are stored in linear_params_ as
531 // [ M
532 // N
533 // O ] but we actually treat it as the matrix:
534 // [ M 0 0
535 // 0 N 0
536 // 0 0 O ]
537 CuMatrix<BaseFloat> linear_params_;
538 CuVector<BaseFloat> bias_params_;
539 int32 num_blocks_;
540 private:
541 const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
542 };
544 class RepeatedAffineComponent: public UpdatableComponent {
545 public:
547 virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
548 virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
550 virtual std::string Info() const;
551 virtual void InitFromConfig(ConfigLine *cfl);
553 RepeatedAffineComponent() { } // use Init to really initialize.
554 virtual std::string Type() const { return "RepeatedAffineComponent"; }
555 virtual int32 Properties() const {
556 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
557 kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
558 }
559 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
560 const CuMatrixBase<BaseFloat> &in,
561 CuMatrixBase<BaseFloat> *out) const;
562 virtual void Backprop(const std::string &debug_info,
563 const ComponentPrecomputedIndexes *indexes,
564 const CuMatrixBase<BaseFloat> &in_value,
565 const CuMatrixBase<BaseFloat> &, // out_value
566 const CuMatrixBase<BaseFloat> &out_deriv,
567 Component *to_update,
568 CuMatrixBase<BaseFloat> *in_deriv) const;
570 virtual void Read(std::istream &is, bool binary);
571 virtual void Write(std::ostream &os, bool binary) const;
573 virtual Component* Copy() const;
575 // Some functions from base-class UpdatableComponent.
576 virtual void Scale(BaseFloat scale);
577 virtual void Add(BaseFloat alpha, const Component &other);
578 virtual void PerturbParams(BaseFloat stddev);
579 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
580 virtual int32 NumParameters() const;
581 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
582 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
584 // Some functions that are specific to this class.
585 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
586 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
587 explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
589 void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
590 BaseFloat param_stddev, BaseFloat bias_mean,
591 BaseFloat bias_stddev);
592 friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
593 protected:
594 // This function Update(), called from backprop, is broken out for
595 // extensibility to natural gradient update.
596 virtual void Update(
597 const CuMatrixBase<BaseFloat> &in_value,
598 const CuMatrixBase<BaseFloat> &out_deriv);
600 // This function does nothing here but is redefined in child-class
601 // NaturalGradientRepeatedAffineComponent. This help avoid repeated code.
602 virtual void SetNaturalGradientConfigs() { }
604 const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
605 CuMatrix<BaseFloat> linear_params_;
606 CuVector<BaseFloat> bias_params_;
607 int32 num_repeats_;
608 };
// A version of RepeatedAffineComponent whose Update() applies natural
// gradient (via preconditioner_in_); see the comment near the bottom of the
// class for the rationale.
class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
 public:
  // Use Init() to really initialize.
  NaturalGradientRepeatedAffineComponent() { }

  // Most of the public functions are inherited from RepeatedAffineComponent.
  virtual std::string Type() const {
    return "NaturalGradientRepeatedAffineComponent";
  }

  virtual Component* Copy() const;

  // Copy constructor
  explicit NaturalGradientRepeatedAffineComponent(
      const NaturalGradientRepeatedAffineComponent &other);
 private:
  virtual void Update(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientRepeatedAffineComponent &operator=(
      const NaturalGradientRepeatedAffineComponent &other); // Disallow.

  // Applies the default configuration to preconditioner_in_.
  virtual void SetNaturalGradientConfigs();

  // For efficiency reasons we only apply the natural gradient to the input
  // side, i.e. not to the space of output derivatives-- we believe the input
  // side is the more important side.  We don't make the natural-gradient
  // configurable; we just give it a reasonable configuration.
  // Instead of using the individual data-points, for efficiency reasons we use
  // the distribution of per-minibatch summed derivatives over each dimension of
  // the output space, as the source for the Fisher matrix.
  OnlineNaturalGradient preconditioner_in_;
};
// SoftmaxComponent: the softmax nonlinearity.  Dimension and stats storage
// are inherited from NonlinearComponent.  No trainable parameters.
class SoftmaxComponent: public NonlinearComponent {
 public:
  explicit SoftmaxComponent(const SoftmaxComponent &other):
      NonlinearComponent(other) { }
  SoftmaxComponent() { }
  virtual std::string Type() const { return "SoftmaxComponent"; }
  virtual int32 Properties() const {
    // Backprop uses the output value; activation stats are accumulated via
    // StoreStats().
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 private:
  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};
// LogSoftmaxComponent: the log-softmax nonlinearity.  Dimension and stats
// storage are inherited from NonlinearComponent.  No trainable parameters.
class LogSoftmaxComponent: public NonlinearComponent {
 public:
  explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
      NonlinearComponent(other) { }
  LogSoftmaxComponent() { }
  virtual std::string Type() const { return "LogSoftmaxComponent"; }
  virtual int32 Properties() const {
    // Backprop uses the output value; activation stats are accumulated via
    // the kStoresStats mechanism.
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
 private:
  LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
};
697 /// Keywords: natural gradient descent, NG-SGD, naturalgradient. For
698 /// the top-level of the natural gradient code look here, and also in
699 /// nnet-precondition-online.h.
700 /// NaturalGradientAffineComponent is
701 /// a version of AffineComponent that has a non-(multiple of unit) learning-rate
702 /// matrix. See nnet-precondition-online.h for a description of the technique.
703 /// It is described, under the name Online NG-SGD, in the paper "Parallel
704 /// training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
705 /// workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
class NaturalGradientAffineComponent: public AffineComponent {
 public:
  virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  // Initializes with random parameters plus the natural-gradient
  // configuration (see the member comments below for the meanings of
  // rank_in/rank_out/update_period/num_samples_history/alpha/
  // max_change_per_sample).
  void Init(int32 input_dim, int32 output_dim,
            BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
            int32 rank_in, int32 rank_out, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha,
            BaseFloat max_change_per_sample);
  // As above, but the parameters are read from the matrix in
  // 'matrix_filename'.
  void Init(int32 rank_in, int32 rank_out, int32 update_period,
            BaseFloat num_samples_history,
            BaseFloat alpha, BaseFloat max_change_per_sample,
            std::string matrix_filename);
  // this constructor does not really initialize, use Init() or Read().
  NaturalGradientAffineComponent();
  virtual void Resize(int32 input_dim, int32 output_dim);
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Info() const;
  virtual Component* Copy() const;
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  // copy constructor
  explicit NaturalGradientAffineComponent(
      const NaturalGradientAffineComponent &other);
  virtual void ZeroStats();

 private:
  // disallow assignment operator.
  NaturalGradientAffineComponent &operator= (
      const NaturalGradientAffineComponent&);

  // Configs for preconditioner.  The input side tends to be better
  // conditioned -> smaller rank needed, so make them separately configurable.
  int32 rank_in_;
  int32 rank_out_;
  int32 update_period_;
  BaseFloat num_samples_history_;
  BaseFloat alpha_;

  // Preconditioner for the input side of the affine transform.
  OnlineNaturalGradient preconditioner_in_;

  // Preconditioner for the output-derivative side.
  OnlineNaturalGradient preconditioner_out_;

  // If > 0, max_change_per_sample_ is the maximum amount of parameter
  // change (in L2 norm) that we allow per sample, averaged over the minibatch.
  // This was introduced in order to control instability.
  // Instead of the exact L2 parameter change, for
  // efficiency purposes we limit a bound on the exact
  // change.  The limit is applied via a constant <= 1.0
  // for each minibatch.  A suitable value might be, for
  // example, 10 or so; larger if there are more
  // parameters.
  BaseFloat max_change_per_sample_;

  // update_count_ records how many updates we have done.
  double update_count_;

  // active_scaling_count_ records how many updates we have done,
  // where the scaling factor is active (not 1.0).
  double active_scaling_count_;

  // max_change_scale_stats_ records the sum of scaling factors
  // in each update, so we can compute the averaged scaling factor
  // in Info().
  double max_change_scale_stats_;

  // Sets the configs rank, alpha and eta in the preconditioner objects,
  // from the class variables.
  void SetNaturalGradientConfigs();

  // Overrides AffineComponent::Update() to apply the natural-gradient
  // preconditioning before the parameter update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
};
/// FixedAffineComponent is an affine transform that is supplied
/// at network initialization time and is not trainable.
class FixedAffineComponent: public Component {
 public:
  FixedAffineComponent() { }
  virtual std::string Type() const { return "FixedAffineComponent"; }
  virtual std::string Info() const;

  // Copy constructor from AffineComponent-- can be used when we're done
  // training a particular part of the model and want to efficiently disable
  // further training.
  FixedAffineComponent(const AffineComponent &c);

  /// matrix should be of size input-dim+1 to output-dim, last col is offset
  void Init(const CuMatrixBase<BaseFloat> &matrix);

  // The ConfigLine cfl contains just the option matrix=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
  virtual int32 InputDim() const { return linear_params_.NumCols(); }
  virtual int32 OutputDim() const { return linear_params_.NumRows(); }

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Function to provide access to linear_params_.
  const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 protected:
  friend class AffineComponent;
  // The fixed (non-trainable) parameters: linear_params_ is output-dim by
  // input-dim (see InputDim()/OutputDim() above); bias_params_ is the offset.
  CuMatrix<BaseFloat> linear_params_;
  CuVector<BaseFloat> bias_params_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
};
/// SumGroupComponent is used to sum up groups of posteriors.
/// It's used to introduce a kind of Gaussian-mixture-model-like
/// idea into neural nets.  This is basically a degenerate case of
/// MixtureProbComponent; we had to implement it separately to
/// be efficient for CUDA (we can use this one regardless of whether
/// we have CUDA or not; it's the normal case we want anyway).
///
/// There are two forms of initialization in a config file: one
/// where the number of elements are specified for each group
/// individually as a vector, and one where only the total input
/// dimension and the output dimension (number of groups) is specified.
/// The second is used when all groups have the same size.
class SumGroupComponent: public Component {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  void Init(const std::vector<int32> &sizes);  // the vector is of the input dim
                                               // (>= 1) for each output dim.
  void Init(int32 input_dim, int32 output_dim);
  void GetSizes(std::vector<int32> *sizes) const;  // Get a vector saying, for
                                                   // each output-dim, how many
                                                   // inputs were summed over.
  virtual void InitFromConfig(ConfigLine *cfl);
  SumGroupComponent() { }
  virtual std::string Type() const { return "SumGroupComponent"; }
  virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
  // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
  // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
  CuArray<Int32Pair> indexes_;  // for each output index, the (start, end) input
                                // index.
  CuArray<int32> reverse_indexes_;  // for each input index, the output index.
  int32 input_dim_;
  int32 output_dim_;
};
/// FixedScaleComponent applies a fixed per-element scale; it's similar
/// to the Rescale component in the nnet1 setup (and only needed for nnet1
/// model conversion).
class FixedScaleComponent: public Component {
 public:
  FixedScaleComponent() { }
  virtual std::string Type() const { return "FixedScaleComponent"; }
  virtual std::string Info() const;
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
  }

  // Sets up the component from the vector of per-element scaling factors.
  void Init(const CuVectorBase<BaseFloat> &scales);

  // The ConfigLine cfl contains only the option scales=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  friend class AffineComponent;  // necessary for collapse
  CuVector<BaseFloat> scales_;  // the fixed per-element scaling factors.
  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
};
928 /// FixedBiasComponent applies a fixed per-element bias; it's similar
929 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
930 /// model conversion.
931 class FixedBiasComponent: public Component {
932 public:
933 FixedBiasComponent() { }
934 virtual std::string Type() const { return "FixedBiasComponent"; }
935 virtual std::string Info() const;
937 virtual int32 Properties() const {
938 return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
939 }
941 void Init(const CuVectorBase<BaseFloat> &scales);
943 // The ConfigLine cfl contains only the option bias=<string>,
944 // where the string is the filename of a Kaldi-format matrix to read.
945 virtual void InitFromConfig(ConfigLine *cfl);
946 virtual int32 InputDim() const { return bias_.Dim(); }
947 virtual int32 OutputDim() const { return bias_.Dim(); }
948 using Component::Propagate; // to avoid name hiding
949 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
950 const CuMatrixBase<BaseFloat> &in,
951 CuMatrixBase<BaseFloat> *out) const;
952 virtual void Backprop(const std::string &debug_info,
953 const ComponentPrecomputedIndexes *indexes,
954 const CuMatrixBase<BaseFloat> &, // in_value,
955 const CuMatrixBase<BaseFloat> &, // out_value
956 const CuMatrixBase<BaseFloat> &out_deriv,
957 Component *, // to_update
958 CuMatrixBase<BaseFloat> *in_deriv) const;
959 virtual Component* Copy() const;
960 virtual void Read(std::istream &is, bool binary);
961 virtual void Write(std::ostream &os, bool binary) const;
963 protected:
964 CuVector<BaseFloat> bias_;
965 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
966 };
// NoOpComponent just duplicates its input.  We don't anticipate this being used
// very often, but it may sometimes make your life easier
class NoOpComponent: public NonlinearComponent {
 public:
  explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
  NoOpComponent() { }
  virtual std::string Type() const { return "NoOpComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace;
  }
  virtual Component* Copy() const { return new NoOpComponent(*this); }
  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
 private:
  NoOpComponent &operator = (const NoOpComponent &other); // Disallow.
};
// ClipGradientComponent just duplicates its input, but clips gradients
// during backpropagation if they cross a predetermined threshold.
// This component will be used to prevent gradient explosion problem in
// recurrent neural networks
class ClipGradientComponent: public Component {
 public:
  // Constructor that fully initializes the component, including the
  // accumulated statistics (num_clipped, count, etc.); delegates to Init().
  ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
                        bool norm_based_clipping,
                        BaseFloat self_repair_clipped_proportion_threshold,
                        BaseFloat self_repair_target,
                        BaseFloat self_repair_scale,
                        int32 num_clipped,
                        int32 count,
                        int32 num_self_repaired,
                        int32 num_backpropped) {
    Init(dim, clipping_threshold, norm_based_clipping,
         self_repair_clipped_proportion_threshold,
         self_repair_target,
         self_repair_scale,
         num_clipped, count,
         num_self_repaired, num_backpropped);}

  // Default constructor: sets inert defaults (clipping_threshold_ = -1
  // means no clipping); use Init(), InitFromConfig() or Read() to
  // really initialize.
  ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
                           norm_based_clipping_(false),
                           self_repair_clipped_proportion_threshold_(1.0),
                           self_repair_target_(0.0),
                           self_repair_scale_(0.0),
                           num_clipped_(0), count_(0),
                           num_self_repaired_(0), num_backpropped_(0) { }

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
            BaseFloat self_repair_clipped_proportion_threshold,
            BaseFloat self_repair_target,
            BaseFloat self_repair_scale,
            int32 num_clipped, int32 count,
            int32 num_self_repaired, int32 num_backpropped);

  virtual std::string Type() const { return "ClipGradientComponent"; }

  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
        kBackpropNeedsInput;
  }

  virtual void ZeroStats();

  // NOTE(review): Copy() does not carry over debug_info_, so the copy's
  // destructor log message will lack the node name -- confirm this is
  // intentional.
  virtual Component* Copy() const {
    return new ClipGradientComponent(dim_,
                                     clipping_threshold_,
                                     norm_based_clipping_,
                                     self_repair_clipped_proportion_threshold_,
                                     self_repair_target_,
                                     self_repair_scale_,
                                     num_clipped_,
                                     count_,
                                     num_self_repaired_,
                                     num_backpropped_);}

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
  virtual ~ClipGradientComponent() {
    if (num_self_repaired_ > 0)
      KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
                << ")'s self-repair was activated " << num_self_repaired_
                << " time(s) out of " << num_backpropped_
                << " times of calling Backprop() in this training job.";
  }
 private:
  int32 dim_;  // input/output dimension
  BaseFloat clipping_threshold_;  // threshold to be used for clipping
                                  // could correspond to max-row-norm (if
                                  // norm_based_clipping_ == true) or
                                  // max-absolute-value (otherwise)
  bool norm_based_clipping_;  // if true the max-row-norm will be clipped
                              // else element-wise absolute value clipping is
                              // done

  // some configuration values relating to self-repairing.
  BaseFloat self_repair_clipped_proportion_threshold_; // the threshold of
                                                       // clipped-proportion
                                                       // for self-repair to be
                                                       // activated
  BaseFloat self_repair_target_; // the target value towards which self-repair
                                 // is trying to set for in-deriv
  BaseFloat self_repair_scale_;  // constant scaling the self-repair vector
  std::string debug_info_;   // component-node name, used in the destructor to
                             // print out stats of self-repair

  // this function is called from Backprop code, and only does something if the
  // self-repair-scale config value is set and the current clipped proportion
  // exceeds the threshold. What it does is to add a term to in-deriv that
  // forces the input to the ClipGradientComponent to be close to some small
  // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
  // Sigmoid or Tanh or Affine). The hope is that if the input is forced to be
  // small, the parameters on the path will also tend to be small, which may
  // help tamp down the divergence caused by gradient explosion.
  void RepairGradients(const std::string &debug_info,
                       const CuMatrixBase<BaseFloat> &in_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       ClipGradientComponent *to_update) const;

  ClipGradientComponent &operator =
      (const ClipGradientComponent &other); // Disallow.

 protected:
  // variables to store stats
  // An element corresponds to rows of derivative matrix, when
  // norm_based_clipping_ is true,
  // else it corresponds to each element of the derivative matrix
  // Note: no stats are stored when norm_based_clipping_ is false
  int32 num_clipped_;  // number of elements which were clipped
  int32 count_;  // number of elements which were processed
  int32 num_self_repaired_; // number of times self-repair is activated
  int32 num_backpropped_; //number of times backprop is called

};
/** PermuteComponent changes the order of the columns (i.e. the feature or
    activation dimensions).  Output dimension i is mapped to input dimension
    column_map_[i], so it's like doing:
      for each row:
        for each feature/activation dimension i:
          output(row, i) = input(row, column_map_[i]).

*/
class PermuteComponent: public Component {
 public:
  PermuteComponent()  {}
  PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }

  virtual int32 InputDim() const { return column_map_.Dim(); }
  virtual int32 OutputDim() const { return column_map_.Dim(); }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(const std::vector<int32> &column_map);

  virtual std::string Type() const { return "PermuteComponent"; }

  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput;
  }

  virtual void ZeroStats() {}

  virtual Component* Copy() const;

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // No trainable parameters, so Scale()/Add() are no-ops.
  virtual void Scale(BaseFloat scale) {}
  virtual void Add(BaseFloat alpha, const Component &other) {}
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
 private:
  // computes the reverse column map.  Must not be called if column_map_.Dim()
  // == 0
  void ComputeReverseColumnMap();
  CuArray<int32> column_map_;
  // the following is a derived variable, not written to disk.
  // It is used in backprop.
  CuArray<int32> reverse_column_map_;
  PermuteComponent &operator =
      (const PermuteComponent &other); // Disallow.
};
// PerElementScaleComponent scales each dimension of its input with a separate
// trainable scale; it's like a linear component with a diagonal matrix.
class PerElementScaleComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementScaleComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kLinearInInput|
        kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
  }

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementScaleComponent(const PerElementScaleComponent &other);

  // Initializes with random scales drawn from the given mean/stddev.
  void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
  // Initializes by reading the scales from a Kaldi-format vector file.
  void Init(std::string vector_filename);

 protected:
  friend class AffineComponent;  // necessary for collapse
  // This function Update() is for extensibility; child classes may override
  // this, e.g. for natural gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv) {
    UpdateSimple(in_value, out_deriv);
  }
  // UpdateSimple is used when *this is a gradient.  Child classes may override
  // this if needed, but typically won't need to.
  virtual void UpdateSimple(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const PerElementScaleComponent &operator
      = (const PerElementScaleComponent &other); // Disallow.
  CuVector<BaseFloat> scales_;  // the trainable per-element scales.
};
// PerElementOffsetComponent offsets each dimension of its input with a
// separate trainable bias; it's like an affine component with a fixed
// weight matrix which is always equal to I.
class PerElementOffsetComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return offsets_.Dim(); }
  virtual int32 OutputDim() const { return offsets_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementOffsetComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementOffsetComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|
           kBackpropInPlace|kPropagateInPlace;
  }

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);

  // Initializes with random offsets drawn from the given mean/stddev.
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev);
  // Initializes by reading the offsets from a Kaldi-format vector file.
  void Init(std::string vector_filename);

 protected:
  const PerElementOffsetComponent &operator
      = (const PerElementOffsetComponent &other); // Disallow.
  CuVector<BaseFloat> offsets_;  // the trainable per-element offsets.
};
// ConstantFunctionComponent returns constant function of its input,
// i.e. its output does not depend on its input.  It is the same as
// an affine component with the linear term fixed at zero.
// It is optionally trainable, and optionally you can use natural
// gradient.  The input is required only because it's more convenient
// to make SimpleComponents [but see ConstantComponent, which requires
// no inputs].
class ConstantFunctionComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_.Dim(); }

  virtual std::string Info() const;
  // possible parameter values with their defaults:
  // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
  // output-mean=0 output-stddev=0
  virtual void InitFromConfig(ConfigLine *cfl);

  ConstantFunctionComponent();

  ConstantFunctionComponent(const ConstantFunctionComponent &other);

  virtual std::string Type() const { return "ConstantFunctionComponent"; }
  // Note: the properties depend on the configuration: updatable-related
  // flags only if is_updatable_, and in-place propagation only when the
  // input and output dims happen to match.
  virtual int32 Properties() const {
    return kSimpleComponent|
        (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
        (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
        kBackpropAdds;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 private:
  int32 input_dim_;
  // the output value-- a vector.
  CuVector<BaseFloat> output_;

  bool is_updatable_;
  // if true, and if updatable, do natural-gradient update.
  bool use_natural_gradient_;
  OnlineNaturalGradient preconditioner_;

  const ConstantFunctionComponent &operator
      = (const ConstantFunctionComponent &other); // Disallow.
};
// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
// it uses a natural gradient update for the per-element scales, and enforces a
// maximum amount of change per minibatch, for stability.
class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
 public:

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  NaturalGradientPerElementScaleComponent() { } // use Init to really initialize.
  virtual std::string Type() const {
    return "NaturalGradientPerElementScaleComponent";
  }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions that are specific to this class:
  explicit NaturalGradientPerElementScaleComponent(
      const NaturalGradientPerElementScaleComponent &other);

  // Initializes with random scales drawn from the given mean/stddev, plus
  // the natural-gradient configuration and the max-change constraint.
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev, int32 rank, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha,
            BaseFloat max_change_per_minibatch);
  // Initializes the scales from a Kaldi-format vector file, plus the
  // natural-gradient configuration and the max-change constraint.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha, BaseFloat max_change_per_minibatch);

 private:
  // configuration value for imposing max-change...
  BaseFloat max_change_per_minibatch_;

  // unlike the NaturalGradientAffineComponent, there is only one dimension to
  // consider as the parameters are a vector not a matrix, so we only need one
  // preconditioner.
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  // Override of the parent-class Update() function, called only
  // if this->is_gradient_ = false; this implements the natural
  // gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientPerElementScaleComponent &operator
      = (const NaturalGradientPerElementScaleComponent &other); // Disallow.
};
1442 /**
 * ConvolutionComponent implements 2d-convolution.
1444 * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1445 * 2 dimensions as it has same size as the input along the 3rd dimension.
1446 * Input : A matrix where each row is a vectorized 3D-tensor.
1447 * The 3D tensor has dimensions
1448 * x: (e.g. time)
1449 * y: (e.g. frequency)
1450 * z: (e.g. channels like features/delta/delta-delta)
1451 *
1452 * The component supports input vectorizations of type zyx and yzx.
1453 * The default vectorization type is zyx.
1454 * e.g. for input vectorization of type zyx the input is vectorized by
1455 * spanning axes z, y and x of the tensor in that order.
1456 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1457 * the zyx vectorized input looks like
1458 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1459 *
1460 *
1461 * Output : The output is also a 3D tensor vectorized in the zyx format.
1462 * The channel axis (z) in the output corresponds to the output of
1463 * different filters. The first channel corresponds to the first filter
1464 * i.e., first row of the filter_params_ matrix.
1465 *
1466 * Note: The component has to support yzx input vectorization as the binaries
1467 * like add-deltas generate yz vectorized output. These input vectors are
1468 * concatenated using the Append descriptor across time steps to form a yzx
1469 * vectorized 3D tensor input.
1470 * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1471 *
1472 *
1473 * For information on the hyperparameters and parameters of this component see
1474 * the variable declarations.
1475 *
1476 * Propagation:
1477 * ------------
1478 * Convolution operation consists of a dot-products between the filter tensor
1479 * and input tensor patch, for various shifts of filter tensor along the x and y
1480 * axes input tensor. (Note: there is no shift along z-axis as the filter and
1481 * input tensor have same size along this axis).
1482 *
1483 * For a particular shift (i,j) of the filter tensor
1484 * along input tensor dimensions x and y, the elements of the input tensor which
1485 * overlap with the filter form the input tensor patch. This patch is vectorized
1486 * in zyx format. All the patches corresponding to various samples in the
1487 * mini-batch are stacked into a matrix, where each row corresponds to one
1488 * patch. Let this matrix be represented by X_{i,j}. The dot products with
1489 * various filters are computed simultaneously by computing the matrix product
1490 * with the filter_params_ matrix (W)
1491 * Y_{i,j} = X_{i,j}*W^T.
1492 * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1493 *
1494 * All the matrix products corresponding to various shifts (i,j) of the
1495 * filter tensor are computed simultaneously using the AddMatMatBatched
1496 * call of CuMatrixBase class.
1497 *
1498 * BackPropagation:
1499 * ----------------
1500 * Backpropagation to compute the input derivative (\nabla X_{i,j})
 * consists of a series of matrix products.
1502 * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the
1503 * output derivative for a particular shift of the filter.
1504 *
1505 * Once again these matrix products are computed simultaneously.
1506 *
1507 * Update:
1508 * -------
1509 * The weight gradient is computed as
1510 * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j})
1511 *
1512 */
class ConvolutionComponent: public UpdatableComponent {
 public:
  // How the 3D input tensor is vectorized into each input row: spanning
  // the axes either in y,z,x order or z,y,x order (see the class comment
  // above for details).
  enum TensorVectorizationType  {
    kYzx = 0,
    kZyx = 1
  };

  ConvolutionComponent();
  // constructor using another component
  ConvolutionComponent(const ConvolutionComponent &component);
  // constructor using parameters
  ConvolutionComponent(
    const CuMatrixBase<BaseFloat> &filter_params,
    const CuVectorBase<BaseFloat> &bias_params,
    int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
    int32 filt_x_dim, int32 filt_y_dim,
    int32 filt_x_step, int32 filt_y_step,
    TensorVectorizationType input_vectorization,
    BaseFloat learning_rate);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "ConvolutionComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
        kBackpropAdds|kPropagateAdds;
  }

  // Forward/backward passes; see nnet-component-itf.h for the interface
  // semantics, and the class comment above for the convolution algorithm.
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update_in,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  // Parameter update given the input values and the per-shift batch of
  // output derivatives (see "Update" in the class comment above).
  void Update(const std::string &debug_info,
              const CuMatrixBase<BaseFloat> &in_value,
              const CuMatrixBase<BaseFloat> &out_deriv,
              const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
1575 void SetParams(const VectorBase<BaseFloat> &bias,
1576 const MatrixBase<BaseFloat> &filter);
1577 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
1578 const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
1579 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1580 int32 filt_x_dim, int32 filt_y_dim,
1581 int32 filt_x_step, int32 filt_y_step, int32 num_filters,
1582 TensorVectorizationType input_vectorization,
1583 BaseFloat param_stddev, BaseFloat bias_stddev);
1584 // there is no filt_z_dim parameter as the length of the filter along
1585 // z-dimension is same as the input
1586 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1587 int32 filt_x_dim, int32 filt_y_dim,
1588 int32 filt_x_step, int32 filt_y_step,
1589 TensorVectorizationType input_vectorization,
1590 std::string matrix_filename);
1592 // resize the component, setting the parameters to zero, while
1593 // leaving any other configuration values the same
1594 void Resize(int32 input_dim, int32 output_dim);
1596 void Update(const std::string &debug_info,
1597 const CuMatrixBase<BaseFloat> &in_value,
1598 const CuMatrixBase<BaseFloat> &out_deriv);
1601 private:
1602 int32 input_x_dim_; // size of the input along x-axis
1603 // (e.g. number of time steps)
1605 int32 input_y_dim_; // size of input along y-axis
1606 // (e.g. number of mel-frequency bins)
1608 int32 input_z_dim_; // size of input along z-axis
1609 // (e.g. number of channels is 3 if the input has
1610 // features + delta + delta-delta features
1612 int32 filt_x_dim_; // size of the filter along x-axis
1614 int32 filt_y_dim_; // size of the filter along y-axis
1616 // there is no filt_z_dim_ as it is always assumed to be
1617 // the same as input_z_dim_
1619 int32 filt_x_step_; // the number of steps taken along x-axis of input
1620 // before computing the next dot-product
1621 // of filter and input
1623 int32 filt_y_step_; // the number of steps taken along y-axis of input
1624 // before computing the next dot-product of the filter
1625 // and input
1627 // there is no filt_z_step_ as only dot product is possible along this axis
1629 TensorVectorizationType input_vectorization_; // type of vectorization of the
1630 // input 3D tensor. Accepts zyx and yzx formats
1632 CuMatrix<BaseFloat> filter_params_;
1633 // the filter (or kernel) matrix is a matrix of vectorized 3D filters
1634 // where each row in the matrix corresponds to one filter.
1635 // The 3D filter tensor is vectorizedin zyx format.
1636 // The first row of the matrix corresponds to the first filter and so on.
1637 // Keep in mind the vectorization type and order of filters when using file
1638 // based initialization.
1640 CuVector<BaseFloat> bias_params_;
1641 // the filter-specific bias vector (i.e., there is a seperate bias added
1642 // to the output of each filter).
1643 bool is_gradient_;
1645 void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
1646 CuMatrix<BaseFloat> *patches) const;
1647 void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
1648 CuMatrixBase<BaseFloat> *in_deriv) const;
1649 const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
1650 };
1653 // LstmNonlinearityComponent is a component that implements part of an LSTM, by
1654 // combining together the sigmoids and tanh's, plus some diagonal terms, into
1655 // a single block.
1656 // We will refer to the LSTM formulation used in
1657 //
// "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
// by H. Sak et al,
1660 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1661 //
1662 // Suppose the cell dimension is C. Then outside this component, we compute
1663 // the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1664 // matrix multiplication:
1665 //
1666 // i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1667 // f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1668 // c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
//    o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
1670 //
1671 // The part of the computation that takes place in this component is as follows.
1672 // Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and
1673 // c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1674 //
1675 // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1676 //
1677 //
// This component has parameters, 3C of them in total: the diagonal matrices
// w_{ic}, w_{fc} and w_{oc} that appear in equations (1), (2) and (4) below.
1680 //
1681 //
1682 // In the forward pass (Propagate), this component computes the following:
1683 //
1684 // i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
1685 // f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
1686 // c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
1687 // o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
1688 // m_t = o_t * Tanh(c_t) (5)
1689 // # note: the outputs are just c_t and m_t.
1690 //
1691 // The backprop is as you would think, but for the "self-repair" we need to pass
1692 // in additional vectors (of the same dim as the parameters of the layer) that
1693 // dictate whether or not we add an additional term to the backpropagated
1694 // derivatives. (This term helps force the input to the nonlinearities into the
1695 // range where the derivatives are not too small).
1696 //
1697 // This component stores stats of the same form as are normally stored by the
1698 // StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1699 // activations and derivatives, but this is done inside the Backprop() functions.
1700 // [the StoreStats() functions don't take the input data as an argument, so
1701 // storing this data that way is impossible, and anyway it's more efficient to
1702 // do it as part of backprop.]
1703 class LstmNonlinearityComponent: public UpdatableComponent {
1704 public:
1706 virtual int32 InputDim() const;
1707 virtual int32 OutputDim() const;
1708 virtual std::string Info() const;
1709 virtual void InitFromConfig(ConfigLine *cfl);
1710 LstmNonlinearityComponent() { } // use Init to really initialize.
1711 virtual std::string Type() const { return "LstmNonlinearityComponent"; }
1712 virtual int32 Properties() const {
1713 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
1714 }
1716 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1717 const CuMatrixBase<BaseFloat> &in,
1718 CuMatrixBase<BaseFloat> *out) const;
1719 virtual void Backprop(const std::string &debug_info,
1720 const ComponentPrecomputedIndexes *indexes,
1721 const CuMatrixBase<BaseFloat> &in_value,
1722 const CuMatrixBase<BaseFloat> &, // out_value,
1723 const CuMatrixBase<BaseFloat> &out_deriv,
1724 Component *to_update_in,
1725 CuMatrixBase<BaseFloat> *in_deriv) const;
1727 virtual void Read(std::istream &is, bool binary);
1728 virtual void Write(std::ostream &os, bool binary) const;
1730 virtual Component* Copy() const;
1732 // Some functions from base-class UpdatableComponent.
1733 virtual void Scale(BaseFloat scale);
1734 virtual void Add(BaseFloat alpha, const Component &other);
1735 virtual void PerturbParams(BaseFloat stddev);
1736 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1737 virtual int32 NumParameters() const;
1738 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1739 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
1740 virtual void ZeroStats();
1742 // Some functions that are specific to this class:
1743 explicit LstmNonlinearityComponent(
1744 const LstmNonlinearityComponent &other);
1746 void Init(int32 cell_dim, BaseFloat param_stddev,
1747 BaseFloat tanh_self_repair_threshold,
1748 BaseFloat sigmoid_self_repair_threshold,
1749 BaseFloat self_repair_scale);
1751 void Init(std::string vector_filename,
1752 int32 rank, int32 update_period, BaseFloat num_samples_history,
1753 BaseFloat alpha, BaseFloat max_change_per_minibatch);
1755 private:
1757 // Initializes the natural-gradient object with the configuration we
1758 // use for this object, which for now is hardcoded at the C++ level.
1759 void InitNaturalGradient();
1762 // Notation: C is the cell dimension; it equals params_.NumCols().
1764 // The dimension of the parameter matrix is (3 x C);
1765 // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
1766 CuMatrix<BaseFloat> params_;
1768 // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1769 // equations (1) through (5), this is the sum of the values of the nonliearities
1770 // (used for diagnostics only). It is comparable to value_sum_ vector
1771 // in base-class NonlinearComponent.
1772 CuMatrix<double> value_sum_;
1774 // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1775 // equations (1) through (5), this is the sum of the derivatives of the
1776 // nonliearities (used for diagnostics and to control self-repair). It is
1777 // comparable to the deriv_sum_ vector in base-class
1778 // NonlinearComponent.
1779 CuMatrix<double> deriv_sum_;
1781 // This matrix has dimension 10. The contents are a block of 5 self-repair
1782 // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
1783 // self-repair scales (typically all 0.00001). These are for each of the 5
1784 // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
1785 // more info).
1786 CuVector<BaseFloat> self_repair_config_;
1788 // This matrix has dimension 5. For each of the 5 nonlinearities in the LSTM
1789 // component (see comments in cu-math.h for more info), it contains the total,
1790 // over all frames represented in count_, of the number of dimensions that
1791 // were subject to self_repair. To get the self-repair proportion you should
1792 // divide by (count_ times cell_dim_).
1793 CuVector<double> self_repair_total_;
1795 // The total count (number of frames) corresponding to the stats in value_sum_
1796 // and deriv_sum_.
1797 double count_;
1799 // Preconditioner for the parameters of this component [operates in the space
1800 // of dimension C].
1801 // The preconditioner stores its own configuration values; we write and read
1802 // these, but not the preconditioner object itself.
1803 OnlineNaturalGradient preconditioner_;
1805 const LstmNonlinearityComponent &operator
1806 = (const LstmNonlinearityComponent &other); // Disallow.
1807 };
1812 /*
1813 * MaxPoolingComponent :
 * Maxpooling component was first used in ConvNets for selecting a
 * representative activation in an area. It inspired the Maxout nonlinearity.
1816 * Each output element of this component is the maximum of a block of
1817 * input elements where the block has a 3D dimension (pool_x_size_,
1818 * pool_y_size_, pool_z_size_).
1819 * Blocks could overlap if the shift value on any axis is smaller
1820 * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
 * If the shift values are equal to their pool size, there is no
1822 * overlap; while if they all equal 1, the blocks overlap to
1823 * the greatest possible extent.
1824 *
1825 * This component is designed to be used after a ConvolutionComponent
1826 * so that the input matrix is propagated from a 2d-convolutional layer.
1827 * This component implements 3d-maxpooling which performs
1828 * max pooling along the three axes.
1829 * Input : A matrix where each row is a vectorized 3D-tensor.
1830 * The 3D tensor has dimensions
1831 * x: (e.g. time)
1832 * y: (e.g. frequency)
1833 * z: (e.g. channels like number of filters in the ConvolutionComponent)
1834 *
1835 * The component assumes input vectorizations of type zyx
1836 * which is the default output vectorization type of a ConvolutionComponent.
1837 * e.g. for input vectorization of type zyx the input is vectorized by
1838 * spanning axes z, y and x of the tensor in that order.
1839 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1840 * the zyx vectorized input looks like
1841 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1842 *
1843 * Output : The output is also a 3D tensor vectorized in the zyx format.
1844 *
1845 * For information on the hyperparameters and parameters of this component see
1846 * the variable declarations.
1847 *
1848 *
1849 */
class MaxpoolingComponent: public Component {
 public:

  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
  // constructor using another component
  MaxpoolingComponent(const MaxpoolingComponent &component);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  // Checks that the configuration values are consistent (e.g. that the pool
  // sizes/steps tile the input dimensions).
  virtual void Check() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
        kBackpropAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }

  // Copies each input pooling block into one row of 'patches', so that the
  // max over a block becomes a row-wise max.
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Scatter-adds the per-patch derivatives back to the (possibly overlapping)
  // block locations of the input-derivative matrix.
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;

 protected:
  int32 input_x_dim_;   // size of the input along x-axis
  // (e.g. number of time steps)
  int32 input_y_dim_;   // size of input along y-axis
  // (e.g. number of mel-frequency bins)
  int32 input_z_dim_;   // size of input along z-axis
  // (e.g. number of filters in the ConvolutionComponent)

  int32 pool_x_size_;    // size of the pooling window along x-axis
  int32 pool_y_size_;    // size of the pooling window along y-axis
  int32 pool_z_size_;    // size of the pooling window along z-axis

  int32 pool_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next pool
  int32 pool_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next pool
  int32 pool_z_step_;   // the number of steps taken along z-axis of input
                        // before computing the next pool

};
1917 /**
1918 CompositeComponent is a component representing a sequence of
1919 [simple] components. The config line would be something like the following
1920 (imagine this is all on one line):
1922 component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
1923 component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
1924 component2='type=RectifiedLinearComponent dim=10000' \
1925 component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
1927 The reason you might want to use this component, instead of directly using
1928 the same sequence of components in the config file, is to save GPU memory (at
1929 the expense of more compute)-- because doing it like this means we have to
1930 re-do parts of the forward pass in the backprop phase, but we avoid using
1931 much memory for very long (and you can make the memory usage very small by
1932 making max-rows-process small). We inherit from UpdatableComponent just in
1933 case one or more of the components in the sequence are updatable.
1935 It is an error to nest a CompositeComponent inside a CompositeComponent.
1936 The same effect can be accomplished by specifying a smaller max-rows-process
1937 in a single CompositeComponent.
1938 */
1939 class CompositeComponent: public UpdatableComponent {
1940 public:
1941 virtual int32 InputDim() const;
1942 virtual int32 OutputDim() const;
1944 virtual std::string Info() const;
1946 virtual void InitFromConfig(ConfigLine *cfl);
1948 virtual Component* Copy() const;
1950 CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.
1952 // Initialize from this list of components; takes ownership of the pointers.
1953 void Init(const std::vector<Component*> &components,
1954 int32 max_rows_process);
1956 virtual std::string Type() const { return "CompositeComponent"; }
1958 // The properties depend on the properties of the constituent components. As
1959 // a special case, we never return kStoresStats in the properties: by default
1960 // we store things like activation stats (e.g. for nonlinear components like
1961 // ReLU) as part of the backprop. This means we may wastefully store stats
1962 // even when not requested, but it does save time as a separate StoreStats()
1963 // call would involve propagating the internals.
1964 virtual int32 Properties() const;
1966 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1967 const CuMatrixBase<BaseFloat> &in,
1968 CuMatrixBase<BaseFloat> *out) const;
1969 virtual void Backprop(const std::string &debug_info,
1970 const ComponentPrecomputedIndexes *indexes,
1971 const CuMatrixBase<BaseFloat> &in_value,
1972 const CuMatrixBase<BaseFloat> &, // out_value
1973 const CuMatrixBase<BaseFloat> &out_deriv,
1974 Component *to_update,
1975 CuMatrixBase<BaseFloat> *in_deriv) const;
1977 // note, we don't implement StoreStats() as it would be inefficient. Instead,
1978 // by default we call StoreStats() on all members that have the flag set,
1979 // inside the Backprop.
1980 virtual void ZeroStats();
1982 virtual void Read(std::istream &is, bool binary);
1983 virtual void Write(std::ostream &os, bool binary) const;
1985 // Don't implement Copy() at this level: implement it in the child class.
1987 // Some functions from base-class UpdatableComponent.
1988 virtual void SetUnderlyingLearningRate(BaseFloat lrate);
1989 virtual void SetActualLearningRate(BaseFloat lrate);
1990 virtual void SetAsGradient();
1991 virtual void Scale(BaseFloat scale);
1992 virtual void Add(BaseFloat alpha, const Component &other);
1993 virtual void PerturbParams(BaseFloat stddev);
1994 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1995 virtual int32 NumParameters() const;
1996 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1997 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
1999 // note: we dont implement the StoreStats function as it would be quite
2000 // expensive; instead, by default we call StoreStats() for any components that
2001 // want to store stats, as part of the backprop pass. This is not 100% ideal
2002 // but it will usually do what you want. We can revisit this later if needed.
2004 // Functions to iterate over the internal components
2006 int32 NumComponents() const { return components_.size();}
2007 /// Gets the ith component in this component.
2008 /// The ordering is the same as in the config line. The caller
2009 /// does not own the received component.
2010 const Component* GetComponent(int32 i) const;
2011 /// Sets the ith component. After this call, CompositeComponent owns
2012 /// the reference to the argument component. Frees the previous
2013 /// ith component.
2014 void SetComponent(int32 i, Component *component);
2016 virtual ~CompositeComponent() { DeletePointers(&components_); }
2017 private:
2018 // returns the stride type, kDefaultStride or kStrideEqualNumCols,
2019 // at the output of the i'th component.
2020 inline MatrixStrideType GetStrideType(int32 i) const;
2022 // returns true if at least one of 'components_' returns the kUpdatable flag
2023 // in its flags.
2024 bool IsUpdatable() const;
2026 // the maximum number of
2027 int32 max_rows_process_;
2028 std::vector<Component*> components_;
2030 };
2033 } // namespace nnet3
2034 } // namespace kaldi
2037 #endif