// nnet3/nnet-simple-component.h

// Copyright 2011-2013  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)
//                2013  Xiaohui Zhang
//           2014-2015  Vijayaditya Peddinti
//           2014-2015  Guoguo Chen
//                2015  Daniel Galvez
//                2015  Tom Ko

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
29 #include "nnet3/nnet-common.h"
30 #include "nnet3/nnet-component-itf.h"
31 #include "nnet3/natural-gradient-online.h"
32 #include <iostream>
34 namespace kaldi {
35 namespace nnet3 {
/// @file  nnet-simple-component.h
///   This file contains declarations of components that are "simple", meaning
///   they don't care about the indexes they are operating on, produce one
///   output for one input, and return the kSimpleComponent flag in their
///   Properties(): for example, tanh and affine components.  In
///   nnet-general-component.h there are components that don't fit this pattern.
44 // This "nnet3" version of the p-norm component only supports the 2-norm.
45 class PnormComponent: public Component {
46 public:
47 void Init(int32 input_dim, int32 output_dim);
48 explicit PnormComponent(int32 input_dim, int32 output_dim) {
49 Init(input_dim, output_dim);
50 }
51 virtual int32 Properties() const {
52 return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
53 }
54 PnormComponent(): input_dim_(0), output_dim_(0) { }
55 virtual std::string Type() const { return "PnormComponent"; }
56 virtual void InitFromConfig(ConfigLine *cfl);
57 virtual int32 InputDim() const { return input_dim_; }
58 virtual int32 OutputDim() const { return output_dim_; }
59 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
60 const CuMatrixBase<BaseFloat> &in,
61 CuMatrixBase<BaseFloat> *out) const;
62 virtual void Backprop(const std::string &debug_info,
63 const ComponentPrecomputedIndexes *indexes,
64 const CuMatrixBase<BaseFloat> &in_value,
65 const CuMatrixBase<BaseFloat> &out_value,
66 const CuMatrixBase<BaseFloat> &out_deriv,
67 Component *to_update,
68 CuMatrixBase<BaseFloat> *in_deriv) const;
69 virtual Component* Copy() const { return new PnormComponent(input_dim_,
70 output_dim_); }
72 virtual void Read(std::istream &is, bool binary); // This Read function
73 // requires that the Component has the correct type.
75 /// Write component to stream
76 virtual void Write(std::ostream &os, bool binary) const;
78 protected:
79 int32 input_dim_;
80 int32 output_dim_;
81 };
83 // This component randomly zeros dropout_proportion of the input
84 // and the derivatives are backpropagated through the nonzero inputs.
85 // Typically this component used during training but not in test time.
86 // The idea is described under the name Dropout, in the paper
87 // "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
88 class DropoutComponent : public RandomComponent {
89 public:
90 void Init(int32 dim, BaseFloat dropout_proportion = 0.0,
91 bool dropout_per_frame = false);
93 DropoutComponent(int32 dim, BaseFloat dropout = 0.0,
94 bool dropout_per_frame = false) {
95 Init(dim, dropout, dropout_per_frame);
96 }
98 DropoutComponent(): dim_(0), dropout_proportion_(0.0),
99 dropout_per_frame_(false) { }
101 virtual int32 Properties() const {
102 return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
103 }
104 virtual std::string Type() const { return "DropoutComponent"; }
106 virtual void InitFromConfig(ConfigLine *cfl);
108 virtual int32 InputDim() const { return dim_; }
110 virtual int32 OutputDim() const { return dim_; }
112 virtual void Read(std::istream &is, bool binary);
114 // Write component to stream
115 virtual void Write(std::ostream &os, bool binary) const;
117 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
118 const CuMatrixBase<BaseFloat> &in,
119 CuMatrixBase<BaseFloat> *out) const;
120 virtual void Backprop(const std::string &debug_info,
121 const ComponentPrecomputedIndexes *indexes,
122 const CuMatrixBase<BaseFloat> &in_value,
123 const CuMatrixBase<BaseFloat> &out_value,
124 const CuMatrixBase<BaseFloat> &out_deriv,
125 Component *to_update,
126 CuMatrixBase<BaseFloat> *in_deriv) const;
127 virtual Component* Copy() const { return new DropoutComponent(dim_,
128 dropout_proportion_,
129 dropout_per_frame_); }
130 virtual std::string Info() const;
132 void SetDropoutProportion(BaseFloat dropout_proportion) {
133 dropout_proportion_ = dropout_proportion;
134 }
136 private:
137 int32 dim_;
138 /// dropout-proportion is the proportion that is dropped out,
139 /// e.g. if 0.1, we set 10% to zero value.
140 BaseFloat dropout_proportion_;
141 bool dropout_per_frame_;
142 };
144 class ElementwiseProductComponent: public Component {
145 public:
146 void Init(int32 input_dim, int32 output_dim);
147 explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
148 Init(input_dim, output_dim);
149 }
150 virtual int32 Properties() const {
151 return kSimpleComponent|kBackpropNeedsInput;
152 }
153 ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
154 virtual std::string Type() const { return "ElementwiseProductComponent"; }
155 virtual void InitFromConfig(ConfigLine *cfl);
156 virtual int32 InputDim() const { return input_dim_; }
157 virtual int32 OutputDim() const { return output_dim_; }
158 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
159 const CuMatrixBase<BaseFloat> &in,
160 CuMatrixBase<BaseFloat> *out) const;
161 virtual void Backprop(const std::string &debug_info,
162 const ComponentPrecomputedIndexes *indexes,
163 const CuMatrixBase<BaseFloat> &in_value,
164 const CuMatrixBase<BaseFloat> &out_value,
165 const CuMatrixBase<BaseFloat> &out_deriv,
166 Component *to_update,
167 CuMatrixBase<BaseFloat> *in_deriv) const;
168 virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
169 output_dim_); }
171 virtual void Read(std::istream &is, bool binary); // This Read function
172 // requires that the Component has the correct type.
174 /// Write component to stream
175 virtual void Write(std::ostream &os, bool binary) const;
177 protected:
178 int32 input_dim_;
179 int32 output_dim_;
180 };
182 class NormalizeComponent: public Component {
183 public:
184 void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
185 explicit NormalizeComponent(int32 input_dim,
186 BaseFloat target_rms = 1.0,
187 bool add_log_stddev = false) {
188 Init(input_dim, target_rms, add_log_stddev);
189 }
190 explicit NormalizeComponent(const NormalizeComponent &other);
191 // note: there is some special code in NonlinerComponent::Info() that
192 // specifically caters to this class.
193 virtual int32 Properties() const {
194 return (add_log_stddev_ ?
195 kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
196 kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
197 kBackpropAdds|kBackpropInPlace);
198 }
199 NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
200 virtual std::string Type() const { return "NormalizeComponent"; }
201 virtual void InitFromConfig(ConfigLine *cfl);
202 virtual Component* Copy() const { return new NormalizeComponent(*this); }
203 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
204 const CuMatrixBase<BaseFloat> &in,
205 CuMatrixBase<BaseFloat> *out) const;
206 virtual void Backprop(const std::string &debug_info,
207 const ComponentPrecomputedIndexes *indexes,
208 const CuMatrixBase<BaseFloat> &in_value,
209 const CuMatrixBase<BaseFloat> &, // out_value
210 const CuMatrixBase<BaseFloat> &out_deriv,
211 Component *to_update,
212 CuMatrixBase<BaseFloat> *in_deriv) const;
214 virtual void Read(std::istream &is, bool binary);
215 virtual void Write(std::ostream &os, bool binary) const;
216 virtual int32 InputDim() const { return input_dim_; }
217 virtual int32 OutputDim() const {
218 return (input_dim_ + (add_log_stddev_ ? 1 : 0));
219 }
220 virtual std::string Info() const;
221 private:
222 NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
223 enum { kExpSquaredNormFloor = -66 };
224 static const BaseFloat kSquaredNormFloor;
225 int32 input_dim_;
226 BaseFloat target_rms_; // The target rms for outputs.
227 // about 0.7e-20. We need a value that's exactly representable in
228 // float and whose inverse square root is also exactly representable
229 // in float (hence, an even power of two).
231 bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
232 // is an extra dimension of the output.
233 };
236 /*
237 Implements the sigmoid nonlinearity, i.e. the function y = exp(-x).
239 Configuration values accepted:
240 dim Dimension of this component, e.g. 1024
242 Configuration values inherited from NonlinearComponent, and their
243 local meanings:
244 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.05. This
245 controls the self-repair mechanism, which for sigmoid units
246 consists of identifying units which are oversaturated (i.e.
247 usually close to -1 or +1) and nudging the inputs to be
248 closer to zero. It gates on the average derivative of the
249 nonlinearity, which for sigmoid is a value between 0 and
250 0.25. For units where the average function-derivative
251 accumulated during this iteration (job) of training is less
252 than this threshold, we activate self-repair, which consists
253 of adding (-self-repair-scale * (2*the output of the
254 nonlinearity - 1.0)) to the backpropagated derivatives.
255 This just happens to be a convenient-to-compute function
256 that's +1 for large negative inputs, and -1 for large positive
257 inputs, and smooth in between.
258 The default value of this is -1000, which the code internally
259 maps to 0.05 which is suitable for sigmoid units; if you do set it,
260 you can set it to a value like 0.025 or 0.075.
261 self-repair-scale Scale for the self-repair mechanism; see comments above.
262 default=0, but we usually set this to 1.0e-05 (or
263 occasionally 1.0e-04) in the scripts.
265 */
266 class SigmoidComponent: public NonlinearComponent {
267 public:
268 explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
269 SigmoidComponent() { }
270 virtual std::string Type() const { return "SigmoidComponent"; }
271 virtual int32 Properties() const {
272 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
273 }
274 virtual Component* Copy() const { return new SigmoidComponent(*this); }
275 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
276 const CuMatrixBase<BaseFloat> &in,
277 CuMatrixBase<BaseFloat> *out) const;
278 virtual void Backprop(const std::string &debug_info,
279 const ComponentPrecomputedIndexes *indexes,
280 const CuMatrixBase<BaseFloat> &, //in_value
281 const CuMatrixBase<BaseFloat> &out_value,
282 const CuMatrixBase<BaseFloat> &out_deriv,
283 Component *to_update,
284 CuMatrixBase<BaseFloat> *in_deriv) const;
285 virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
286 private:
287 // this function is called from Backprop code and only does something if the
288 // self-repair-scale config value is set.
289 void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
290 CuMatrixBase<BaseFloat> *in_deriv,
291 SigmoidComponent *to_update) const;
293 SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
294 };
296 /*
297 Implements the tanh nonlinearity, i.e. the function y = tanh(x).
299 Configuration values accepted:
300 dim Dimension of this component, e.g. 1024
302 Configuration values inherited from NonlinearComponent, and their
303 local meanings:
304 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.2. This
305 controls the self-repair mechanism, which for tanh units
306 consists of identifying units which are oversaturated (i.e.
307 usually close to -1 or +1) and nudging the inputs to be
308 closer to zero. It gates on the average derivative of
309 the nonlinearity, which for tanh is a value between 0 and 1.
310 For units where the average function-derivative accumulated
311 during this iteration (job) of training is less than
312 this threshold, we activate self-repair, which consists of
313 adding (-self-repair-scale * the output of the nonlinearity),
314 i.e. (-self-repair-scale * tanh(x)) to the backpropagated
315 derivatives.
316 The default value of this is -1000, which the code internally
317 maps to 0.2 which is suitable for tanh units; if you do set it,
318 you can set it to a value like 0.1 or 0.3.
319 self-repair-scale Scale for the self-repair mechanism; see comments above.
320 default=0, but we usually set this to 1.0e-05 (or
321 occasionally 1.0e-04) in the scripts.
322 */
323 class TanhComponent: public NonlinearComponent {
324 public:
325 explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
326 TanhComponent() { }
327 virtual std::string Type() const { return "TanhComponent"; }
328 virtual Component* Copy() const { return new TanhComponent(*this); }
329 virtual int32 Properties() const {
330 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
331 }
332 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
333 const CuMatrixBase<BaseFloat> &in,
334 CuMatrixBase<BaseFloat> *out) const;
335 virtual void Backprop(const std::string &debug_info,
336 const ComponentPrecomputedIndexes *indexes,
337 const CuMatrixBase<BaseFloat> &, //in_value
338 const CuMatrixBase<BaseFloat> &out_value,
339 const CuMatrixBase<BaseFloat> &out_deriv,
340 Component *to_update,
341 CuMatrixBase<BaseFloat> *in_deriv) const;
342 virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
343 private:
344 // this function is called from Backprop code and only does something if the
345 // self-repair-scale config value is set.
346 void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
347 CuMatrixBase<BaseFloat> *in_deriv,
348 TanhComponent *to_update) const;
350 TanhComponent &operator = (const TanhComponent &other); // Disallow.
351 };
354 /*
355 Implements the Rectified Linear Unit nonlinearity, a.k.a. ReLU.
357 Configuration values accepted:
358 dim Dimension of this component, e.g. 1024
360 Configuration values inherited from NonlinearComponent, and their
361 local meanings:
362 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.05. (Lower
363 threshold for self-repair, if set; in this case acts on
364 the average function-derivative, which is the proportion
365 of the time the output is > 0. For any unit where the
366 average function-derivative is lower than this threshold,
367 we add 'self-repair-scale' to the backpropagated
368 derivatives in backprop. There is no default
369 (default=-1000, which is interpreted specially).
370 self-repair-upper-threshold e.g. self-repair-upper-threshold=0.95.
371 Like self-repair-lower-threshold, but controls self-repair
372 for units that are active *too* much of the time. Units
373 whose average function-derivative exceeds this threshold
374 will have the negative of 'self-repair-scale' added to their
375 input derivatives in backprop. There is no default
376 (default=-1000, which is interpreted specially).
377 self-repair-scale Scale for the self-repair mechanism; see comments for
378 self-repair-lower-threshold and self-repair-upper-threshold
379 for details. default=0, but we usually set this to 1.0e-05
380 (or occasionally 1.0e-04) in the scripts.
381 */
382 class RectifiedLinearComponent: public NonlinearComponent {
383 public:
384 explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
385 NonlinearComponent(other) { }
386 RectifiedLinearComponent() { }
387 virtual std::string Type() const { return "RectifiedLinearComponent"; }
388 virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
389 virtual int32 Properties() const {
390 return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
391 kStoresStats;
392 }
393 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
394 const CuMatrixBase<BaseFloat> &in,
395 CuMatrixBase<BaseFloat> *out) const;
396 virtual void Backprop(const std::string &debug_info,
397 const ComponentPrecomputedIndexes *indexes,
398 const CuMatrixBase<BaseFloat> &, //in_value
399 const CuMatrixBase<BaseFloat> &out_value,
400 const CuMatrixBase<BaseFloat> &out_deriv,
401 Component *to_update,
402 CuMatrixBase<BaseFloat> *in_deriv) const;
403 virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
405 private:
406 // this function is called from Backprop code and only does something if the
407 // self-repair-scale config value is set.
408 void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
409 RectifiedLinearComponent *to_update) const;
411 RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
412 };
414 /**
415 This component is a fixed (non-trainable) nonlinearity that sums its inputs
416 to produce outputs. Currently the only supported configuration is that its
417 input-dim is interpreted as consisting of n blocks, and the output is just a
418 summation over the n blocks, where n = input-dim / output-dim, so for instance
419 output[n] = input[n] + input[block-size + n] + .... .
420 Later if needed we can add a configuration variable that allows you to sum
421 over 'interleaved' input.
422 */
423 class SumReduceComponent: public Component {
424 public:
425 void Init(int32 input_dim, int32 output_dim);
426 explicit SumReduceComponent(int32 input_dim, int32 output_dim) {
427 Init(input_dim, output_dim);
428 }
429 virtual int32 Properties() const {
430 return kSimpleComponent|kLinearInInput;
431 }
432 SumReduceComponent(): input_dim_(0), output_dim_(0) { }
433 virtual std::string Type() const { return "SumReduceComponent"; }
434 virtual void InitFromConfig(ConfigLine *cfl);
435 virtual int32 InputDim() const { return input_dim_; }
436 virtual int32 OutputDim() const { return output_dim_; }
437 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
438 const CuMatrixBase<BaseFloat> &in,
439 CuMatrixBase<BaseFloat> *out) const;
440 virtual void Backprop(const std::string &debug_info,
441 const ComponentPrecomputedIndexes *indexes,
442 const CuMatrixBase<BaseFloat> &, // in_value
443 const CuMatrixBase<BaseFloat> &, // out_value,
444 const CuMatrixBase<BaseFloat> &out_deriv,
445 Component *, // to_update
446 CuMatrixBase<BaseFloat> *in_deriv) const;
447 virtual Component* Copy() const { return new SumReduceComponent(input_dim_,
448 output_dim_); }
450 virtual void Read(std::istream &is, bool binary); // This Read function
451 // requires that the Component has the correct type.
453 /// Write component to stream
454 virtual void Write(std::ostream &os, bool binary) const;
456 protected:
457 int32 input_dim_;
458 int32 output_dim_;
459 };
// Forward declarations of component classes referenced below (e.g. by
// AffineComponent's CollapseWithNext()/CollapseWithPrevious() functions).
class FixedAffineComponent;
class FixedScaleComponent;
class PerElementScaleComponent;
class PerElementOffsetComponent;
467 // Affine means a linear function plus an offset.
468 // Note: although this class can be instantiated, it also
469 // functions as a base-class for more specialized versions of
470 // AffineComponent.
471 class AffineComponent: public UpdatableComponent {
472 friend class SoftmaxComponent; // Friend declaration relates to mixing up.
473 public:
475 virtual int32 InputDim() const { return linear_params_.NumCols(); }
476 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
478 virtual std::string Info() const;
479 virtual void InitFromConfig(ConfigLine *cfl);
481 AffineComponent() { } // use Init to really initialize.
482 virtual std::string Type() const { return "AffineComponent"; }
483 virtual int32 Properties() const {
484 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
485 kBackpropNeedsInput|kBackpropAdds;
486 }
489 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
490 const CuMatrixBase<BaseFloat> &in,
491 CuMatrixBase<BaseFloat> *out) const;
492 virtual void Backprop(const std::string &debug_info,
493 const ComponentPrecomputedIndexes *indexes,
494 const CuMatrixBase<BaseFloat> &in_value,
495 const CuMatrixBase<BaseFloat> &, // out_value
496 const CuMatrixBase<BaseFloat> &out_deriv,
497 Component *to_update,
498 CuMatrixBase<BaseFloat> *in_deriv) const;
500 virtual void Read(std::istream &is, bool binary);
501 virtual void Write(std::ostream &os, bool binary) const;
503 virtual Component* Copy() const;
506 // Some functions from base-class UpdatableComponent.
507 virtual void Scale(BaseFloat scale);
508 virtual void Add(BaseFloat alpha, const Component &other);
509 virtual void PerturbParams(BaseFloat stddev);
510 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
511 virtual int32 NumParameters() const;
512 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
513 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
515 // Some functions that are specific to this class.
517 // This new function is used when mixing up:
518 virtual void SetParams(const VectorBase<BaseFloat> &bias,
519 const MatrixBase<BaseFloat> &linear);
520 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
521 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
522 explicit AffineComponent(const AffineComponent &other);
523 // The next constructor is used in converting from nnet1.
524 AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
525 const CuVectorBase<BaseFloat> &bias_params,
526 BaseFloat learning_rate);
527 void Init(int32 input_dim, int32 output_dim,
528 BaseFloat param_stddev, BaseFloat bias_stddev);
529 void Init(std::string matrix_filename);
531 // This function resizes the dimensions of the component, setting the
532 // parameters to zero, while leaving any other configuration values the same.
533 virtual void Resize(int32 input_dim, int32 output_dim);
535 // The following functions are used for collapsing multiple layers
536 // together. They return a pointer to a new Component equivalent to
537 // the sequence of two components. We haven't implemented this for
538 // FixedLinearComponent yet.
539 Component *CollapseWithNext(const AffineComponent &next) const ;
540 Component *CollapseWithNext(const FixedAffineComponent &next) const;
541 Component *CollapseWithNext(const FixedScaleComponent &next) const;
542 Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
544 protected:
545 friend class NaturalGradientAffineComponent;
546 // This function Update() is for extensibility; child classes may override
547 // this, e.g. for natural gradient update.
548 virtual void Update(
549 const std::string &debug_info,
550 const CuMatrixBase<BaseFloat> &in_value,
551 const CuMatrixBase<BaseFloat> &out_deriv) {
552 UpdateSimple(in_value, out_deriv);
553 }
554 // UpdateSimple is used when *this is a gradient. Child classes may override
555 // this if needed, but typically won't need to.
556 virtual void UpdateSimple(
557 const CuMatrixBase<BaseFloat> &in_value,
558 const CuMatrixBase<BaseFloat> &out_deriv);
560 const AffineComponent &operator = (const AffineComponent &other); // Disallow.
561 CuMatrix<BaseFloat> linear_params_;
562 CuVector<BaseFloat> bias_params_;
563 };
565 class RepeatedAffineComponent;
567 /// This class implements an affine transform using a block diagonal matrix
568 /// e.g., one whose weight matrix is all zeros except for blocks on the
569 /// diagonal. All these blocks have the same dimensions.
570 /// input-dim: num cols of block diagonal matrix.
571 /// output-dim: num rows of block diagonal matrix.
572 /// num-blocks: number of blocks in diagonal of the matrix.
573 /// num-blocks must divide both input-dim and output-dim
574 class BlockAffineComponent : public UpdatableComponent {
575 public:
576 virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
577 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
579 virtual std::string Info() const;
580 virtual void InitFromConfig(ConfigLine *cfl);
582 BlockAffineComponent() { }
583 virtual std::string Type() const { return "BlockAffineComponent"; }
584 virtual int32 Properties() const {
585 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
586 kBackpropNeedsInput|kBackpropAdds;
587 }
589 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
590 const CuMatrixBase<BaseFloat> &in,
591 CuMatrixBase<BaseFloat> *out) const;
593 virtual void Backprop(const std::string &debug_info,
594 const ComponentPrecomputedIndexes *indexes,
595 const CuMatrixBase<BaseFloat> &in_value,
596 const CuMatrixBase<BaseFloat> &, // out_value
597 const CuMatrixBase<BaseFloat> &out_deriv,
598 Component *to_update,
599 CuMatrixBase<BaseFloat> *in_deriv) const;
601 virtual void Read(std::istream &is, bool binary);
602 virtual void Write(std::ostream &os, bool binary) const;
604 virtual Component* Copy() const;
606 // Functions from base-class UpdatableComponent.
607 virtual void Scale(BaseFloat scale);
608 virtual void Add(BaseFloat alpha, const Component &other);
609 virtual void PerturbParams(BaseFloat stddev);
610 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
611 virtual int32 NumParameters() const;
612 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
613 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
615 // BlockAffine-specific functions.
616 void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
617 BaseFloat param_stddev, BaseFloat bias_mean,
618 BaseFloat bias_stddev);
619 explicit BlockAffineComponent(const BlockAffineComponent &other);
620 explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
621 protected:
622 // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
623 // equal size. The blocks are stored in linear_params_ as
624 // [ M
625 // N
626 // O ] but we actually treat it as the matrix:
627 // [ M 0 0
628 // 0 N 0
629 // 0 0 O ]
630 CuMatrix<BaseFloat> linear_params_;
631 CuVector<BaseFloat> bias_params_;
632 int32 num_blocks_;
633 private:
634 const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
635 };
637 class RepeatedAffineComponent: public UpdatableComponent {
638 public:
640 virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
641 virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
643 virtual std::string Info() const;
644 virtual void InitFromConfig(ConfigLine *cfl);
646 RepeatedAffineComponent() { } // use Init to really initialize.
647 virtual std::string Type() const { return "RepeatedAffineComponent"; }
648 virtual int32 Properties() const {
649 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
650 kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
651 }
652 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
653 const CuMatrixBase<BaseFloat> &in,
654 CuMatrixBase<BaseFloat> *out) const;
655 virtual void Backprop(const std::string &debug_info,
656 const ComponentPrecomputedIndexes *indexes,
657 const CuMatrixBase<BaseFloat> &in_value,
658 const CuMatrixBase<BaseFloat> &, // out_value
659 const CuMatrixBase<BaseFloat> &out_deriv,
660 Component *to_update,
661 CuMatrixBase<BaseFloat> *in_deriv) const;
663 virtual void Read(std::istream &is, bool binary);
664 virtual void Write(std::ostream &os, bool binary) const;
666 virtual Component* Copy() const;
668 // Some functions from base-class UpdatableComponent.
669 virtual void Scale(BaseFloat scale);
670 virtual void Add(BaseFloat alpha, const Component &other);
671 virtual void PerturbParams(BaseFloat stddev);
672 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
673 virtual int32 NumParameters() const;
674 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
675 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
677 // Some functions that are specific to this class.
678 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
679 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
680 explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
682 void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
683 BaseFloat param_stddev, BaseFloat bias_mean,
684 BaseFloat bias_stddev);
685 friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
686 protected:
687 // This function Update(), called from backprop, is broken out for
688 // extensibility to natural gradient update.
689 virtual void Update(
690 const CuMatrixBase<BaseFloat> &in_value,
691 const CuMatrixBase<BaseFloat> &out_deriv);
693 // This function does nothing here but is redefined in child-class
694 // NaturalGradientRepeatedAffineComponent. This help avoid repeated code.
695 virtual void SetNaturalGradientConfigs() { }
697 const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
698 CuMatrix<BaseFloat> linear_params_;
699 CuVector<BaseFloat> bias_params_;
700 int32 num_repeats_;
701 };
703 class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
704 public:
705 // Use Init() to really initialize.
706 NaturalGradientRepeatedAffineComponent() { }
708 // Most of the public functions are inherited from RepeatedAffineComponent.
709 virtual std::string Type() const {
710 return "NaturalGradientRepeatedAffineComponent";
711 }
713 virtual Component* Copy() const;
715 // Copy constructor
716 explicit NaturalGradientRepeatedAffineComponent(
717 const NaturalGradientRepeatedAffineComponent &other);
718 private:
719 virtual void Update(
720 const CuMatrixBase<BaseFloat> &in_value,
721 const CuMatrixBase<BaseFloat> &out_deriv);
723 const NaturalGradientRepeatedAffineComponent &operator=(
724 const NaturalGradientRepeatedAffineComponent &other); // Disallow.
726 // Applies the default configuration to preconditioner_in_.
727 virtual void SetNaturalGradientConfigs();
729 // For efficiency reasons we only apply the natural gradient to the input
730 // side, i.e. not to the space of output derivatives-- we believe the input
731 // side is the more important side. We don't make the natural-gradient
732 // configurable; we just give it a reasonable configuration.
733 // Instead of using the individual data-points, for efficiency reasons we use
734 // the distribution of per-minibatch summed derivatives over each dimension of
735 // the output space, as the source for the Fisher matrix.
736 OnlineNaturalGradient preconditioner_in_;
737 };
// SoftmaxComponent implements the softmax nonlinearity.  Input and output
// dimensions are equal (dimension handling is inherited from
// NonlinearComponent).
class SoftmaxComponent: public NonlinearComponent {
 public:
  // Copy constructor; copies the base-class state (dimension, stats, ...).
  explicit SoftmaxComponent(const SoftmaxComponent &other):
      NonlinearComponent(other) { }
  SoftmaxComponent() { }
  virtual std::string Type() const { return "SoftmaxComponent"; }
  virtual int32 Properties() const {
    // kBackpropNeedsOutput: Backprop() is given out_value (the softmax
    // output); kStoresStats: StoreStats() accumulates activation stats.
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  // Accumulates statistics of the output values (see kStoresStats above).
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 private:
  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};
/*
   Implements the log of a softmax nonlinearity, so it's the same
   as shifting each input vector by a constant offset so that, when
   exponentiated, it would sum to one.

   We usually use this in place of softmax because the log-scale
   output will not saturate.

   Configuration values accepted:
      dim            e.g. dim=8061.   Usually this is the last component
                     in a network, so 'dim' is the number of classes.
 */
class LogSoftmaxComponent: public NonlinearComponent {
 public:
  // Copy constructor; copies the base-class state (dimension, stats, ...).
  explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
      NonlinearComponent(other) { }
  LogSoftmaxComponent() { }
  virtual std::string Type() const { return "LogSoftmaxComponent"; }
  virtual int32 Properties() const {
    // kBackpropNeedsOutput: Backprop() is given out_value;
    // kStoresStats: activation statistics are accumulated.
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
 private:
  LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
};
803 /*
804 Keywords: natural gradient descent, NG-SGD, naturalgradient. For
805 the top-level of the natural gradient code look here, and also in
806 nnet-precondition-online.h.
807 NaturalGradientAffineComponent is
808 a version of AffineComponent that has a non-(multiple of unit) learning-rate
809 matrix. See nnet-precondition-online.h for a description of the technique.
810 It is described, under the name Online NG-SGD, in the paper "Parallel
811 training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
812 workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
814 Configuration values accepted by this component:
816 Values inherited from UpdatableComponent (see its declaration in
817 nnet-component-itf for details):
818 learning-rate
819 learning-rate-factor
820 max-change
822 Values used in initializing the component's parameters:
823 input-dim e.g. input-dim=1024. The input dimension.
824 output-dim e.g. output-dim=1024. The output dimension.
825 param-stddev e.g. param-stddev=0.025. The standard deviation
826 used to randomly initialize the linear parameters
827 (as Gaussian random values * param-stddev).
828 Defaults to 1/sqrt(input-dim), which is Glorot
829 initialization.
830 bias-stddev e.g. bias-stddev=0.0. The standard deviation
831 used to randomly initialize the bias parameters.
832 Defaults to 1.0 but we usually set it to 0.0
833 in the config.
      bias-mean       e.g. bias-mean=1.0.  Allows you to initialize the
                      bias parameters with an offset.  Default is 0.0
                      which is normally suitable.
838 matrix e.g. matrix=foo/bar/init.mat May be used as an
839 alternative to (input-dim, output-dim, param-stddev,
840 bias-stddev, bias-mean) to initialize the parameters.
841 Dimension is output-dim by (input-dim + 1), last
842 column is interpreted as the bias.
844 Options to the natural gradient (you won't normally have to set these,
845 the defaults are suitable):
847 num-samples-history Number of frames used as the time-constant to
848 determine how 'up-to-date' the Fisher-matrix
849 estimates are. Smaller -> more up-to-date, but more
850 noisy. default=2000.
851 alpha Constant that determines how much we smooth the
852 Fisher-matrix estimates with the unit matrix.
853 Larger means more smoothing. default=4.0
854 rank-in Rank used in low-rank-plus-unit estimate of Fisher
855 matrix in the input space. default=20.
856 rank-out Rank used in low-rank-plus-unit estimate of Fisher
857 matrix in the output-derivative space. default=80.
      update-period   Determines the frequency (in minibatches) with which
                      we update the Fisher-matrix estimates;
                      making this > 1 saves a little time in training.
                      default=4.
862 */
863 class NaturalGradientAffineComponent: public AffineComponent {
864 public:
865 virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
866 virtual void Read(std::istream &is, bool binary);
867 virtual void Write(std::ostream &os, bool binary) const;
868 void Init(int32 input_dim, int32 output_dim,
869 BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
870 int32 rank_in, int32 rank_out, int32 update_period,
871 BaseFloat num_samples_history, BaseFloat alpha);
872 void Init(int32 rank_in, int32 rank_out, int32 update_period,
873 BaseFloat num_samples_history,
874 BaseFloat alpha, std::string matrix_filename);
875 // this constructor does not really initialize, use Init() or Read().
876 NaturalGradientAffineComponent();
877 void Resize(int32 input_dim, int32 output_dim);
878 void InitFromConfig(ConfigLine *cfl);
879 virtual std::string Info() const;
880 virtual Component* Copy() const;
881 virtual void Scale(BaseFloat scale);
882 virtual void Add(BaseFloat alpha, const Component &other);
883 // copy constructor
884 explicit NaturalGradientAffineComponent(
885 const NaturalGradientAffineComponent &other);
886 private:
887 // disallow assignment operator.
888 NaturalGradientAffineComponent &operator= (
889 const NaturalGradientAffineComponent&);
891 // Configs for preconditioner. The input side tends to be better conditioned ->
892 // smaller rank needed, so make them separately configurable.
893 int32 rank_in_;
894 int32 rank_out_;
895 int32 update_period_;
896 BaseFloat num_samples_history_;
897 BaseFloat alpha_;
899 OnlineNaturalGradient preconditioner_in_;
901 OnlineNaturalGradient preconditioner_out_;
903 // Sets the configs rank, alpha and eta in the preconditioner objects,
904 // from the class variables.
905 void SetNaturalGradientConfigs();
907 virtual void Update(
908 const std::string &debug_info,
909 const CuMatrixBase<BaseFloat> &in_value,
910 const CuMatrixBase<BaseFloat> &out_deriv);
911 };
/// FixedAffineComponent is an affine transform that is supplied
/// at network initialization time and is not trainable.
class FixedAffineComponent: public Component {
 public:
  FixedAffineComponent() { }
  virtual std::string Type() const { return "FixedAffineComponent"; }
  virtual std::string Info() const;

  // Copy constructor from AffineComponent-- can be used when we're done
  // training a particular part of the model and want to efficiently disable
  // further training.
  // NOTE(review): not 'explicit', so AffineComponent converts implicitly to
  // FixedAffineComponent -- confirm this is intended.
  FixedAffineComponent(const AffineComponent &c);

  /// matrix should be of size input-dim+1 to output-dim, last col is offset
  void Init(const CuMatrixBase<BaseFloat> &matrix);

  // The ConfigLine cfl contains just the option matrix=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
  virtual int32 InputDim() const { return linear_params_.NumCols(); }
  virtual int32 OutputDim() const { return linear_params_.NumRows(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Function to provide access to linear_params_.
  const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 protected:
  friend class AffineComponent;
  // linear_params_ is output-dim by input-dim; bias_params_ has
  // dimension output-dim (see InputDim()/OutputDim() above).
  CuMatrix<BaseFloat> linear_params_;
  CuVector<BaseFloat> bias_params_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
};
/// SumGroupComponent is used to sum up groups of posteriors.
/// It's used to introduce a kind of Gaussian-mixture-model-like
/// idea into neural nets.  This is basically a degenerate case of
/// MixtureProbComponent; we had to implement it separately to
/// be efficient for CUDA (we can use this one regardless of whether
/// we have CUDA or not; it's the normal case we want anyway).
///
/// There are two forms of initialization in a config file: one
/// where the number of elements are specified for each group
/// individually as a vector, and one where only the total input
/// dimension and the output dimension (number of groups) is specified.
/// The second is used when all groups have the same size.
class SumGroupComponent: public Component {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  void Init(const std::vector<int32> &sizes); // the vector is of the input dim
                                              // (>= 1) for each output dim.
  // Initialization used when all groups have the same size,
  // i.e. input_dim / output_dim elements per group.
  void Init(int32 input_dim, int32 output_dim);
  void GetSizes(std::vector<int32> *sizes) const; // Get a vector saying, for
                                                  // each output-dim, how many
                                                  // inputs were summed over.
  virtual void InitFromConfig(ConfigLine *cfl);
  SumGroupComponent() { }
  virtual std::string Type() const { return "SumGroupComponent"; }
  virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
  // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
  // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
  CuArray<Int32Pair> indexes_; // for each output index, the (start, end) input
                               // index.
  CuArray<int32> reverse_indexes_; // for each input index, the output index.
  int32 input_dim_;
  int32 output_dim_;
};
/// FixedScaleComponent applies a fixed per-element scale; it's similar
/// to the Rescale component in the nnet1 setup (and only needed for nnet1
/// model conversion).
class FixedScaleComponent: public Component {
 public:
  FixedScaleComponent() { }
  virtual std::string Type() const { return "FixedScaleComponent"; }
  virtual std::string Info() const;
  virtual int32 Properties() const {
    // In-place propagate/backprop are possible because the operation is a
    // per-element multiply (input and output dims are equal).
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
  }

  // Initializes from the supplied vector of per-element scales.
  void Init(const CuVectorBase<BaseFloat> &scales);

  // The ConfigLine cfl contains only the option scales=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  friend class AffineComponent;  // necessary for collapse
  // The fixed (non-trainable) per-element scales.
  CuVector<BaseFloat> scales_;
  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
};
1058 /// FixedBiasComponent applies a fixed per-element bias; it's similar
1059 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
1060 /// model conversion.
1061 class FixedBiasComponent: public Component {
1062 public:
1063 FixedBiasComponent() { }
1064 virtual std::string Type() const { return "FixedBiasComponent"; }
1065 virtual std::string Info() const;
1067 virtual int32 Properties() const {
1068 return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
1069 }
1071 void Init(const CuVectorBase<BaseFloat> &scales);
1073 // The ConfigLine cfl contains only the option bias=<string>,
1074 // where the string is the filename of a Kaldi-format matrix to read.
1075 virtual void InitFromConfig(ConfigLine *cfl);
1076 virtual int32 InputDim() const { return bias_.Dim(); }
1077 virtual int32 OutputDim() const { return bias_.Dim(); }
1078 using Component::Propagate; // to avoid name hiding
1079 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1080 const CuMatrixBase<BaseFloat> &in,
1081 CuMatrixBase<BaseFloat> *out) const;
1082 virtual void Backprop(const std::string &debug_info,
1083 const ComponentPrecomputedIndexes *indexes,
1084 const CuMatrixBase<BaseFloat> &, // in_value,
1085 const CuMatrixBase<BaseFloat> &, // out_value
1086 const CuMatrixBase<BaseFloat> &out_deriv,
1087 Component *, // to_update
1088 CuMatrixBase<BaseFloat> *in_deriv) const;
1089 virtual Component* Copy() const;
1090 virtual void Read(std::istream &is, bool binary);
1091 virtual void Write(std::ostream &os, bool binary) const;
1093 protected:
1094 CuVector<BaseFloat> bias_;
1095 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
1096 };
// NoOpComponent just duplicates its input.  We don't anticipate this being used
// very often, but it may sometimes make your life easier.
class NoOpComponent: public NonlinearComponent {
 public:
  // Copy constructor; copies the base-class state (dimension, stats, ...).
  explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
  NoOpComponent() { }
  virtual std::string Type() const { return "NoOpComponent"; }
  virtual int32 Properties() const {
    // The identity map is linear in its input and can be done in place.
    return kSimpleComponent|kLinearInInput|kPropagateInPlace;
  }
  virtual Component* Copy() const { return new NoOpComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
 private:
  NoOpComponent &operator = (const NoOpComponent &other); // Disallow.
};
// ClipGradientComponent just duplicates its input, but clips gradients
// during backpropagation if they cross a predetermined threshold.
// This component will be used to prevent gradient explosion problem in
// recurrent neural networks.
class ClipGradientComponent: public Component {
 public:
  // Constructor taking the full state, including the accumulated stats
  // (num_clipped, count, num_self_repaired, num_backpropped); used by Copy().
  ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
                        bool norm_based_clipping,
                        BaseFloat self_repair_clipped_proportion_threshold,
                        BaseFloat self_repair_target,
                        BaseFloat self_repair_scale,
                        int32 num_clipped,
                        int32 count,
                        int32 num_self_repaired,
                        int32 num_backpropped) {
    Init(dim, clipping_threshold, norm_based_clipping,
         self_repair_clipped_proportion_threshold,
         self_repair_target,
         self_repair_scale,
         num_clipped, count,
         num_self_repaired, num_backpropped);}

  // Default constructor: clipping_threshold_ = -1 here (an otherwise
  // invalid value); use InitFromConfig(), Init() or Read() to initialize.
  ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
                           norm_based_clipping_(false),
                           self_repair_clipped_proportion_threshold_(1.0),
                           self_repair_target_(0.0),
                           self_repair_scale_(0.0),
                           num_clipped_(0), count_(0),
                           num_self_repaired_(0), num_backpropped_(0) { }

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
            BaseFloat self_repair_clipped_proportion_threshold,
            BaseFloat self_repair_target,
            BaseFloat self_repair_scale,
            int32 num_clipped, int32 count,
            int32 num_self_repaired, int32 num_backpropped);

  virtual std::string Type() const { return "ClipGradientComponent"; }

  virtual int32 Properties() const {
    // kBackpropNeedsInput: the input values are needed by the self-repair
    // mechanism in Backprop (see RepairGradients below).
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
           kBackpropNeedsInput;
  }

  virtual void ZeroStats();

  virtual Component* Copy() const {
    return new ClipGradientComponent(dim_,
                                     clipping_threshold_,
                                     norm_based_clipping_,
                                     self_repair_clipped_proportion_threshold_,
                                     self_repair_target_,
                                     self_repair_scale_,
                                     num_clipped_,
                                     count_,
                                     num_self_repaired_,
                                     num_backpropped_);}

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
  // The destructor logs a summary of self-repair activity for this
  // training job, if self-repair was ever activated.
  virtual ~ClipGradientComponent() {
    if (num_self_repaired_ > 0)
      KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
                << ")'s self-repair was activated " << num_self_repaired_
                << " time(s) out of " << num_backpropped_
                << " times of calling Backprop() in this training job.";
  }
 private:
  int32 dim_;  // input/output dimension
  BaseFloat clipping_threshold_;  // threshold to be used for clipping
                                  // could correspond to max-row-norm (if
                                  // norm_based_clipping_ == true) or
                                  // max-absolute-value (otherwise)
  bool norm_based_clipping_;  // if true the max-row-norm will be clipped
                              // else element-wise absolute value clipping is
                              // done

  // some configuration values relating to self-repairing.
  BaseFloat self_repair_clipped_proportion_threshold_; // the threshold of
                                                       // clipped-proportion
                                                       // for self-repair to be
                                                       // activated
  BaseFloat self_repair_target_; // the target value towards which self-repair
                                 // is trying to set for in-deriv
  BaseFloat self_repair_scale_;  // constant scaling the self-repair vector
  std::string debug_info_;   // component-node name, used in the destructor to
                             // print out stats of self-repair

  // this function is called from Backprop code, and only does something if the
  // self-repair-scale config value is set and the current clipped proportion
  // exceeds the threshold.  What it does is to add a term to in-deriv that
  // forces the input to the ClipGradientComponent to be close to some small
  // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
  // Sigmoid or Tanh or Affine).  The hope is that if the input is forced to be
  // small, the parameters on the path will also tend to be small, which may
  // help tamp down the divergence caused by gradient explosion.
  void RepairGradients(const std::string &debug_info,
                       const CuMatrixBase<BaseFloat> &in_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       ClipGradientComponent *to_update) const;

  ClipGradientComponent &operator =
      (const ClipGradientComponent &other); // Disallow.

 protected:
  // variables to store stats
  // An element corresponds to rows of derivative matrix, when
  // norm_based_clipping_ is true,
  // else it corresponds to each element of the derivative matrix
  // Note: no stats are stored when norm_based_clipping_ is false
  int32 num_clipped_;  // number of elements which were clipped
  int32 count_;  // number of elements which were processed
  int32 num_self_repaired_; // number of times self-repair is activated
  int32 num_backpropped_; // number of times backprop is called

};
/** PermuteComponent changes the order of the columns (i.e. the feature or
    activation dimensions).  Output dimension i is mapped to input dimension
    column_map_[i], so it's like doing:
      for each row:
        for each feature/activation dimension i:
          output(row, i) = input(row, column_map_[i]).

*/
class PermuteComponent: public Component {
 public:
  PermuteComponent()  {}
  // NOTE(review): not 'explicit', so std::vector<int32> converts implicitly
  // to PermuteComponent -- confirm this is intended.
  PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }

  virtual int32 InputDim() const { return column_map_.Dim(); }
  virtual int32 OutputDim() const { return column_map_.Dim(); }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(const std::vector<int32> &column_map);

  virtual std::string Type() const { return "PermuteComponent"; }

  virtual int32 Properties() const {
    // A permutation is linear in its input; note that it cannot be done
    // in place (the column map is not necessarily the identity).
    return kSimpleComponent|kLinearInInput;
  }

  virtual void ZeroStats() {}

  virtual Component* Copy() const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // A permutation has no parameters, so Scale() and Add() are no-ops.
  virtual void Scale(BaseFloat scale) {}
  virtual void Add(BaseFloat alpha, const Component &other) {}
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
 private:
  // computes the reverse column map.  Must not be called if column_map_.Dim()
  // == 0
  void ComputeReverseColumnMap();
  CuArray<int32> column_map_;
  // the following is a derived variable, not written to disk.
  // It is used in backprop.
  CuArray<int32> reverse_column_map_;
  PermuteComponent &operator =
      (const PermuteComponent &other); // Disallow.
};
// PerElementScaleComponent scales each dimension of its input with a separate
// trainable scale; it's like a linear component with a diagonal matrix.
class PerElementScaleComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementScaleComponent"; }
  virtual int32 Properties() const {
    // kLinearInParameters: the output is linear in scales_;
    // kBackpropNeedsInput: the parameter derivative involves in_value.
    return kSimpleComponent|kUpdatableComponent|kLinearInInput|
        kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementScaleComponent(const PerElementScaleComponent &other);

  // Initializes the scales randomly (Gaussian with the given mean/stddev).
  void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
  // Initializes the scales from a vector read from 'vector_filename'.
  void Init(std::string vector_filename);

 protected:
  friend class AffineComponent;  // necessary for collapse
  // This function Update() is for extensibility; child classes may override
  // this, e.g. for natural gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv) {
    UpdateSimple(in_value, out_deriv);
  }
  // UpdateSimple is used when *this is a gradient.  Child classes may override
  // this if needed, but typically won't need to.
  virtual void UpdateSimple(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const PerElementScaleComponent &operator
      = (const PerElementScaleComponent &other); // Disallow.
  CuVector<BaseFloat> scales_;
};
// PerElementOffsetComponent offsets each dimension of its input with a
// separate trainable bias; it's like an affine component with a fixed
// weight matrix which is always equal to I.
class PerElementOffsetComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return offsets_.Dim(); }
  virtual int32 OutputDim() const { return offsets_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementOffsetComponent() { } // use Init to really initialize.
  virtual std::string Type() const { return "PerElementOffsetComponent"; }
  virtual int32 Properties() const {
    // Adding an offset can be done in place, in both directions.
    return kSimpleComponent|kUpdatableComponent|
           kBackpropInPlace|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);

  // Initializes the offsets randomly (Gaussian with the given mean/stddev).
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev);
  // Initializes the offsets from a vector read from 'vector_filename'.
  void Init(std::string vector_filename);

 protected:
  const PerElementOffsetComponent &operator
      = (const PerElementOffsetComponent &other); // Disallow.
  CuVector<BaseFloat> offsets_;
};
// ConstantFunctionComponent returns a constant function of its input,
// i.e. its output does not depend on its input.  It is the same as
// an affine component with the linear term fixed at zero.
// It is optionally trainable, and optionally you can use natural
// gradient.  The input is required only because it's more convenient
// to make SimpleComponents [but see ConstantComponent, which requires
// no inputs].
class ConstantFunctionComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_.Dim(); }

  virtual std::string Info() const;
  // possible parameter values with their defaults:
  // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
  // output-mean=0 output-stddev=0
  virtual void InitFromConfig(ConfigLine *cfl);

  ConstantFunctionComponent();

  ConstantFunctionComponent(const ConstantFunctionComponent &other);

  virtual std::string Type() const { return "ConstantFunctionComponent"; }
  virtual int32 Properties() const {
    // Updatable-related flags are only advertised when is_updatable_ is
    // true; in-place propagation is possible only when dims match.
    return kSimpleComponent|
        (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
        (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
        kBackpropAdds;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 private:
  int32 input_dim_;
  // the output value-- a vector.
  CuVector<BaseFloat> output_;

  bool is_updatable_;
  // if true, and if updatable, do natural-gradient update.
  bool use_natural_gradient_;
  OnlineNaturalGradient preconditioner_;

  const ConstantFunctionComponent &operator
      = (const ConstantFunctionComponent &other); // Disallow.
};
// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
// it uses a natural gradient update for the per-element scales, and enforces a
// maximum amount of change per minibatch, for stability.
class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
 public:

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  NaturalGradientPerElementScaleComponent() { } // use Init to really initialize.
  virtual std::string Type() const {
    return "NaturalGradientPerElementScaleComponent";
  }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions that are specific to this class:
  explicit NaturalGradientPerElementScaleComponent(
      const NaturalGradientPerElementScaleComponent &other);

  // Initializes the scales randomly, plus the natural-gradient
  // configuration (rank, update period, history, alpha).
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev, int32 rank, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha);
  // Initializes the scales from a vector on disk, plus the
  // natural-gradient configuration.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha);

 private:
  // unlike the NaturalGradientAffineComponent, there is only one dimension to
  // consider as the parameters are a vector not a matrix, so we only need one
  // preconditioner.
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  // Override of the parent-class Update() function, called only
  // if this->is_gradient_ = false; this implements the natural
  // gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientPerElementScaleComponent &operator
      = (const NaturalGradientPerElementScaleComponent &other); // Disallow.
};
1568 /**
1569 * ConvolutionalComponent implements 2d-convolution.
1570 * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1571 * 2 dimensions as it has same size as the input along the 3rd dimension.
1572 * Input : A matrix where each row is a vectorized 3D-tensor.
1573 * The 3D tensor has dimensions
1574 * x: (e.g. time)
1575 * y: (e.g. frequency)
1576 * z: (e.g. channels like features/delta/delta-delta)
1577 *
1578 * The component supports input vectorizations of type zyx and yzx.
1579 * The default vectorization type is zyx.
1580 * e.g. for input vectorization of type zyx the input is vectorized by
1581 * spanning axes z, y and x of the tensor in that order.
1582 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1583 * the zyx vectorized input looks like
1584 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1585 *
1586 *
1587 * Output : The output is also a 3D tensor vectorized in the zyx format.
1588 * The channel axis (z) in the output corresponds to the output of
1589 * different filters. The first channel corresponds to the first filter
1590 * i.e., first row of the filter_params_ matrix.
1591 *
1592 * Note: The component has to support yzx input vectorization as the binaries
1593 * like add-deltas generate yz vectorized output. These input vectors are
1594 * concatenated using the Append descriptor across time steps to form a yzx
1595 * vectorized 3D tensor input.
1596 * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1597 *
1598 *
1599 * For information on the hyperparameters and parameters of this component see
1600 * the variable declarations.
1601 *
1602 * Propagation:
1603 * ------------
1604 * Convolution operation consists of a dot-products between the filter tensor
1605 * and input tensor patch, for various shifts of filter tensor along the x and y
1606 * axes input tensor. (Note: there is no shift along z-axis as the filter and
1607 * input tensor have same size along this axis).
1608 *
1609 * For a particular shift (i,j) of the filter tensor
1610 * along input tensor dimensions x and y, the elements of the input tensor which
1611 * overlap with the filter form the input tensor patch. This patch is vectorized
1612 * in zyx format. All the patches corresponding to various samples in the
1613 * mini-batch are stacked into a matrix, where each row corresponds to one
1614 * patch. Let this matrix be represented by X_{i,j}. The dot products with
1615 * various filters are computed simultaneously by computing the matrix product
1616 * with the filter_params_ matrix (W)
1617 * Y_{i,j} = X_{i,j}*W^T.
1618 * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1619 *
1620 * All the matrix products corresponding to various shifts (i,j) of the
1621 * filter tensor are computed simultaneously using the AddMatMatBatched
1622 * call of CuMatrixBase class.
1623 *
1624 * BackPropagation:
1625 * ----------------
1626 * Backpropagation to compute the input derivative (\nabla X_{i,j})
 * consists of a series of matrix products.
1628 * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the
1629 * output derivative for a particular shift of the filter.
1630 *
1631 * Once again these matrix products are computed simultaneously.
1632 *
1633 * Update:
1634 * -------
1635 * The weight gradient is computed as
1636 * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j})
1637 *
1638 */
class ConvolutionComponent: public UpdatableComponent {
 public:
  // Specifies the order in which the axes of the 3D input tensor are
  // traversed when it is vectorized into a matrix row (see the comment
  // above this class for details).
  enum TensorVectorizationType  {
    kYzx = 0,
    kZyx = 1
  };

  ConvolutionComponent();
  // constructor using another component
  ConvolutionComponent(const ConvolutionComponent &component);
  // constructor using parameters
  ConvolutionComponent(
    const CuMatrixBase<BaseFloat> &filter_params,
    const CuVectorBase<BaseFloat> &bias_params,
    int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
    int32 filt_x_dim, int32 filt_y_dim,
    int32 filt_x_step, int32 filt_y_step,
    TensorVectorizationType input_vectorization,
    BaseFloat learning_rate);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "ConvolutionComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
           kBackpropAdds|kPropagateAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update_in,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  void Update(const std::string &debug_info,
              const CuMatrixBase<BaseFloat> &in_value,
              const CuMatrixBase<BaseFloat> &out_deriv,
              const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  void SetParams(const VectorBase<BaseFloat> &bias,
                 const MatrixBase<BaseFloat> &filter);
  const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
  const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
  void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
            int32 filt_x_dim, int32 filt_y_dim,
            int32 filt_x_step, int32 filt_y_step, int32 num_filters,
            TensorVectorizationType input_vectorization,
            BaseFloat param_stddev, BaseFloat bias_stddev);
  // there is no filt_z_dim parameter as the length of the filter along
  // z-dimension is same as the input
  void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
            int32 filt_x_dim, int32 filt_y_dim,
            int32 filt_x_step, int32 filt_y_step,
            TensorVectorizationType input_vectorization,
            std::string matrix_filename);

  // resize the component, setting the parameters to zero, while
  // leaving any other configuration values the same
  void Resize(int32 input_dim, int32 output_dim);

  void Update(const std::string &debug_info,
              const CuMatrixBase<BaseFloat> &in_value,
              const CuMatrixBase<BaseFloat> &out_deriv);

 private:
  int32 input_x_dim_;   // size of the input along x-axis
                        // (e.g. number of time steps)

  int32 input_y_dim_;   // size of input along y-axis
                        // (e.g. number of mel-frequency bins)

  int32 input_z_dim_;   // size of input along z-axis
                        // (e.g. number of channels is 3 if the input has
                        // features + delta + delta-delta features

  int32 filt_x_dim_;    // size of the filter along x-axis

  int32 filt_y_dim_;    // size of the filter along y-axis

  // there is no filt_z_dim_ as it is always assumed to be
  // the same as input_z_dim_

  int32 filt_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next dot-product
                        // of filter and input

  int32 filt_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next dot-product of the filter
                        // and input

  // there is no filt_z_step_ as only dot product is possible along this axis

  TensorVectorizationType input_vectorization_; // type of vectorization of the
  // input 3D tensor. Accepts zyx and yzx formats

  CuMatrix<BaseFloat> filter_params_;
  // the filter (or kernel) matrix is a matrix of vectorized 3D filters
  // where each row in the matrix corresponds to one filter.
  // The 3D filter tensor is vectorized in zyx format.
  // The first row of the matrix corresponds to the first filter and so on.
  // Keep in mind the vectorization type and order of filters when using file
  // based initialization.

  CuVector<BaseFloat> bias_params_;
  // the filter-specific bias vector (i.e., there is a separate bias added
  // to the output of each filter).

  // NOTE(review): presumably true when this component stores a gradient
  // rather than actual parameters, as in other updatable components --
  // confirm against the .cc file.
  bool is_gradient_;

  // Helper used in Propagate()/Update(): extracts from 'in' the vectorized
  // input patch for each filter shift, one patch per row of 'patches'
  // (see "Propagation" in the comment above this class).
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Helper used in Backprop(): accumulates the per-patch input derivatives
  // back into the full input-derivative matrix (reverse of the mapping done
  // by InputToInputPatches).
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;
  const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
};
1779 /*
1780 LstmNonlinearityComponent is a component that implements part of an LSTM, by
1781 combining together the sigmoids and tanh's, plus some diagonal terms, into
1782 a single block.
1783 We will refer to the LSTM formulation used in
  "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
1786 by H. Sak et al,
1787 http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1789 Suppose the cell dimension is C. Then outside this component, we compute
1790 the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1791 matrix multiplication:
1793 i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1794 f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1795 c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
      o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
1798 The part of the computation that takes place in this component is as follows.
1799 Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and
1800 c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1802 To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1805 This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f
1806 and w_o.
1809 In the forward pass (Propagate), this component computes the following:
1811 i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
1812 f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
1813 c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
1814 o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
1815 m_t = o_t * Tanh(c_t) (5)
1816 # note: the outputs are just c_t and m_t.
1818 The backprop is as you would think, but for the "self-repair" we need to pass
1819 in additional vectors (of the same dim as the parameters of the layer) that
1820 dictate whether or not we add an additional term to the backpropagated
1821 derivatives. (This term helps force the input to the nonlinearities into the
1822 range where the derivatives are not too small).
1824 This component stores stats of the same form as are normally stored by the
1825 StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1826 activations and derivatives, but this is done inside the Backprop() functions.
1827 [the StoreStats() functions don't take the input data as an argument, so
1828 storing this data that way is impossible, and anyway it's more efficient to
1829 do it as part of backprop.]
1831 Configuration values accepted:
1832 cell-dim e.g. cell-dim=1024 Cell dimension. The input
1833 dimension of this component is cell-dim * 5, and the
1834 output dimension is cell-dim * 2. Note: this
1835 component implements only part of the LSTM layer,
1836 see comments above.
1837 param-stddev Standard deviation for random initialization of
1838 the diagonal matrices (AKA peephole connections).
1839 default=1.0, which is probably too high but
1840 we couldn't see any reliable gain from decreasing it.
1841 tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold
1842 in a TanhComponent; applies to both the tanh nonlinearities.
                      default=0.2, you probably won't want to change this.
1844 sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold
1845 in a SigmoidComponent; applies to all three of the sigmoid
1846 nonlinearities. default=0.05, you probably won't want to
1847 change this.
1848 self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent
1849 or TanhComponent; applies to both the sigmoid and tanh
1850 nonlinearities. default=1.0e-05, which you probably won't
1851 want to change unless dealing with an objective function
1852 that has smaller or larger dynamic range than normal, in
1853 which case you might want to make it smaller or larger.
1854 */
class LstmNonlinearityComponent: public UpdatableComponent {
 public:

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  LstmNonlinearityComponent() { } // use Init to really initialize.
  virtual std::string Type() const { return "LstmNonlinearityComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update_in,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
  virtual void ZeroStats();

  // Some functions that are specific to this class:
  explicit LstmNonlinearityComponent(
      const LstmNonlinearityComponent &other);

  // Initializes the component for a cell dimension of 'cell_dim'; the other
  // arguments correspond to the identically-named configuration values
  // documented in the comment above this class.
  void Init(int32 cell_dim, BaseFloat param_stddev,
            BaseFloat tanh_self_repair_threshold,
            BaseFloat sigmoid_self_repair_threshold,
            BaseFloat self_repair_scale);

  // NOTE(review): this overload's arguments mirror the natural-gradient
  // configuration of other components (rank/update-period/etc.); confirm
  // its exact behavior against the .cc file.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha);

 private:

  // Initializes the natural-gradient object with the configuration we
  // use for this object, which for now is hardcoded at the C++ level.
  void InitNaturalGradient();


  // Notation: C is the cell dimension; it equals params_.NumCols().

  // The dimension of the parameter matrix is (3 x C);
  // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
  CuMatrix<BaseFloat> params_;

  // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
  // equations (1) through (5), this is the sum of the values of the
  // nonlinearities (used for diagnostics only). It is comparable to the
  // value_sum_ vector in base-class NonlinearComponent.
  CuMatrix<double> value_sum_;

  // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
  // equations (1) through (5), this is the sum of the derivatives of the
  // nonlinearities (used for diagnostics and to control self-repair). It is
  // comparable to the deriv_sum_ vector in base-class
  // NonlinearComponent.
  CuMatrix<double> deriv_sum_;

  // This vector has dimension 10. The contents are a block of 5 self-repair
  // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
  // self-repair scales (typically all 0.00001). These are for each of the 5
  // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
  // more info).
  CuVector<BaseFloat> self_repair_config_;

  // This vector has dimension 5. For each of the 5 nonlinearities in the LSTM
  // component (see comments in cu-math.h for more info), it contains the total,
  // over all frames represented in count_, of the number of dimensions that
  // were subject to self_repair. To get the self-repair proportion you should
  // divide by (count_ times cell_dim_).
  CuVector<double> self_repair_total_;

  // The total count (number of frames) corresponding to the stats in value_sum_
  // and deriv_sum_.
  double count_;

  // Preconditioner for the parameters of this component [operates in the space
  // of dimension C].
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  const LstmNonlinearityComponent &operator
      = (const LstmNonlinearityComponent &other); // Disallow.
};
1964 /*
1965 * MaxPoolingComponent :
1966 * Maxpooling component was firstly used in ConvNet for selecting an
1967 * representative activation in an area. It inspired Maxout nonlinearity.
1968 * Each output element of this component is the maximum of a block of
1969 * input elements where the block has a 3D dimension (pool_x_size_,
1970 * pool_y_size_, pool_z_size_).
1971 * Blocks could overlap if the shift value on any axis is smaller
1972 * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
 * If the shift values are equal to their pool size, there is no
1974 * overlap; while if they all equal 1, the blocks overlap to
1975 * the greatest possible extent.
1976 *
1977 * This component is designed to be used after a ConvolutionComponent
1978 * so that the input matrix is propagated from a 2d-convolutional layer.
1979 * This component implements 3d-maxpooling which performs
1980 * max pooling along the three axes.
1981 * Input : A matrix where each row is a vectorized 3D-tensor.
1982 * The 3D tensor has dimensions
1983 * x: (e.g. time)
1984 * y: (e.g. frequency)
1985 * z: (e.g. channels like number of filters in the ConvolutionComponent)
1986 *
1987 * The component assumes input vectorizations of type zyx
1988 * which is the default output vectorization type of a ConvolutionComponent.
1989 * e.g. for input vectorization of type zyx the input is vectorized by
1990 * spanning axes z, y and x of the tensor in that order.
1991 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1992 * the zyx vectorized input looks like
1993 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1994 *
1995 * Output : The output is also a 3D tensor vectorized in the zyx format.
1996 *
1997 * For information on the hyperparameters and parameters of this component see
1998 * the variable declarations.
1999 *
2000 *
2001 */
class MaxpoolingComponent: public Component {
 public:

  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
  // constructor using another component
  MaxpoolingComponent(const MaxpoolingComponent &component);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  // Verifies that the configuration values are consistent.
  // NOTE(review): see the .cc file for the exact conditions checked.
  virtual void Check() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
           kBackpropAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }

  // Helper: extracts from 'in' the vectorized patch of input elements for
  // each pooling window, one patch per row of 'patches' (cf. the same-named
  // helper in ConvolutionComponent).
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Helper: accumulates per-patch input derivatives back into the full
  // input-derivative matrix (reverse of the mapping done by
  // InputToInputPatches).
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;

 protected:
  int32 input_x_dim_;   // size of the input along x-axis
                        // (e.g. number of time steps)
  int32 input_y_dim_;   // size of input along y-axis
                        // (e.g. number of mel-frequency bins)
  int32 input_z_dim_;   // size of input along z-axis
                        // (e.g. number of filters in the ConvolutionComponent)

  int32 pool_x_size_;   // size of the pooling window along x-axis
  int32 pool_y_size_;   // size of the pooling window along y-axis
  int32 pool_z_size_;   // size of the pooling window along z-axis

  int32 pool_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next pool
  int32 pool_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next pool
  int32 pool_z_step_;   // the number of steps taken along z-axis of input
                        // before computing the next pool

};
2069 /**
2070 CompositeComponent is a component representing a sequence of
2071 [simple] components. The config line would be something like the following
2072 (imagine this is all on one line):
2074 component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
2075 component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
2076 component2='type=RectifiedLinearComponent dim=10000' \
2077 component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
2079 The reason you might want to use this component, instead of directly using
2080 the same sequence of components in the config file, is to save GPU memory (at
2081 the expense of more compute)-- because doing it like this means we have to
2082 re-do parts of the forward pass in the backprop phase, but we avoid using
2083 much memory for very long (and you can make the memory usage very small by
2084 making max-rows-process small). We inherit from UpdatableComponent just in
2085 case one or more of the components in the sequence are updatable.
2087 It is an error to nest a CompositeComponent inside a CompositeComponent.
2088 The same effect can be accomplished by specifying a smaller max-rows-process
2089 in a single CompositeComponent.
2090 */
class CompositeComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  virtual Component* Copy() const;

  CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.

  // Initialize from this list of components; takes ownership of the pointers.
  void Init(const std::vector<Component*> &components,
            int32 max_rows_process);

  virtual std::string Type() const { return "CompositeComponent"; }

  // The properties depend on the properties of the constituent components.  As
  // a special case, we never return kStoresStats in the properties: by default
  // we store things like activation stats (e.g. for nonlinear components like
  // ReLU) as part of the backprop.  This means we may wastefully store stats
  // even when not requested, but it does save time as a separate StoreStats()
  // call would involve propagating the internals.
  virtual int32 Properties() const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // note, we don't implement StoreStats() as it would be inefficient.  Instead,
  // by default we call StoreStats() on all members that have the flag set,
  // inside the Backprop.
  virtual void ZeroStats();

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Don't implement Copy() at this level: implement it in the child class.

  // Some functions from base-class UpdatableComponent.
  virtual void SetUnderlyingLearningRate(BaseFloat lrate);
  virtual void SetActualLearningRate(BaseFloat lrate);
  virtual void SetAsGradient();
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // note: we don't implement the StoreStats function as it would be quite
  // expensive; instead, by default we call StoreStats() for any components that
  // want to store stats, as part of the backprop pass.  This is not 100% ideal
  // but it will usually do what you want.  We can revisit this later if needed.

  // Functions to iterate over the internal components

  int32 NumComponents() const { return components_.size();}
  /// Gets the ith component in this component.
  /// The ordering is the same as in the config line. The caller
  /// does not own the received component.
  const Component* GetComponent(int32 i) const;
  /// Sets the ith component. After this call, CompositeComponent owns
  /// the reference to the argument component. Frees the previous
  /// ith component.
  void SetComponent(int32 i, Component *component);

  virtual ~CompositeComponent() { DeletePointers(&components_); }
 private:
  // returns the stride type, kDefaultStride or kStrideEqualNumCols,
  // at the output of the i'th component.
  inline MatrixStrideType GetStrideType(int32 i) const;

  // returns true if at least one of 'components_' returns the kUpdatable flag
  // in its flags.
  bool IsUpdatable() const;

  // the maximum number of rows to process at a time; smaller values reduce
  // peak memory use at the cost of extra compute (see the comment at the top
  // of this class).
  int32 max_rows_process_;
  std::vector<Component*> components_;

};
2185 } // namespace nnet3
2186 } // namespace kaldi
2189 #endif