1 // nnet3/nnet-simple-component.h
3 // Copyright 2011-2013 Karel Vesely
4 // 2012-2015 Johns Hopkins University (author: Daniel Povey)
5 // 2013 Xiaohui Zhang
6 // 2014-2015 Vijayaditya Peddinti
7 // 2014-2015 Guoguo Chen
8 // 2015 Daniel Galvez
9 // 2015 Tom Ko
11 // See ../../COPYING for clarification regarding multiple authors
12 //
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 //
17 // http://www.apache.org/licenses/LICENSE-2.0
18 //
19 // THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
20 // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
21 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
22 // MERCHANTABILITY OR NON-INFRINGEMENT.
23 // See the Apache 2 License for the specific language governing permissions and
24 // limitations under the License.
26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
29 #include "nnet3/nnet-common.h"
30 #include "nnet3/nnet-component-itf.h"
31 #include "nnet3/natural-gradient-online.h"
32 #include <iostream>
34 namespace kaldi {
35 namespace nnet3 {
37 /// @file nnet-simple-component.h
38 /// This file contains declarations of components that are "simple", meaning
39 /// they don't care about the indexes they are operating on, produce one
40 /// output for one input, and return the kSimpleComponent flag in their
41 /// Properties(): for example, tanh and affine components. In
42 /// nnet-general-component.h there are components that don't fit this pattern.
44 // This "nnet3" version of the p-norm component only supports the 2-norm.
45 class PnormComponent: public Component {
46 public:
47 void Init(int32 input_dim, int32 output_dim);
48 explicit PnormComponent(int32 input_dim, int32 output_dim) {
49 Init(input_dim, output_dim);
50 }
51 virtual int32 Properties() const {
52 return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
53 }
54 PnormComponent(): input_dim_(0), output_dim_(0) { }
55 virtual std::string Type() const { return "PnormComponent"; }
56 virtual void InitFromConfig(ConfigLine *cfl);
57 virtual int32 InputDim() const { return input_dim_; }
58 virtual int32 OutputDim() const { return output_dim_; }
59 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
60 const CuMatrixBase<BaseFloat> &in,
61 CuMatrixBase<BaseFloat> *out) const;
62 virtual void Backprop(const std::string &debug_info,
63 const ComponentPrecomputedIndexes *indexes,
64 const CuMatrixBase<BaseFloat> &in_value,
65 const CuMatrixBase<BaseFloat> &out_value,
66 const CuMatrixBase<BaseFloat> &out_deriv,
67 void *memo,
68 Component *to_update,
69 CuMatrixBase<BaseFloat> *in_deriv) const;
70 virtual Component* Copy() const { return new PnormComponent(input_dim_,
71 output_dim_); }
73 virtual void Read(std::istream &is, bool binary); // This Read function
74 // requires that the Component has the correct type.
76 /// Write component to stream
77 virtual void Write(std::ostream &os, bool binary) const;
79 protected:
80 int32 input_dim_;
81 int32 output_dim_;
82 };
84 // This component randomly zeros a proportion (dropout_proportion) of its input
85 // elements; derivatives are backpropagated only through the elements that were
86 // not zeroed.  Typically this component is used during training but not at test time.
87 // The idea is described under the name Dropout, in the paper
88 // "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
89 class DropoutComponent : public RandomComponent {
90 public:
91 void Init(int32 dim, BaseFloat dropout_proportion = 0.0,
92 bool dropout_per_frame = false);
94 DropoutComponent(int32 dim, BaseFloat dropout = 0.0,
95 bool dropout_per_frame = false) {
96 Init(dim, dropout, dropout_per_frame);
97 }
99 DropoutComponent(): dim_(0), dropout_proportion_(0.0),
100 dropout_per_frame_(false) { }
102 virtual int32 Properties() const {
103 return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|
104 kBackpropNeedsOutput|kRandomComponent;
105 }
106 virtual std::string Type() const { return "DropoutComponent"; }
108 virtual void InitFromConfig(ConfigLine *cfl);
110 virtual int32 InputDim() const { return dim_; }
112 virtual int32 OutputDim() const { return dim_; }
114 virtual void Read(std::istream &is, bool binary);
116 // Write component to stream
117 virtual void Write(std::ostream &os, bool binary) const;
119 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
120 const CuMatrixBase<BaseFloat> &in,
121 CuMatrixBase<BaseFloat> *out) const;
122 virtual void Backprop(const std::string &debug_info,
123 const ComponentPrecomputedIndexes *indexes,
124 const CuMatrixBase<BaseFloat> &in_value,
125 const CuMatrixBase<BaseFloat> &out_value,
126 const CuMatrixBase<BaseFloat> &out_deriv,
127 void *memo,
128 Component *to_update,
129 CuMatrixBase<BaseFloat> *in_deriv) const;
130 virtual Component* Copy() const { return new DropoutComponent(dim_,
131 dropout_proportion_,
132 dropout_per_frame_); }
133 virtual std::string Info() const;
135 void SetDropoutProportion(BaseFloat dropout_proportion) {
136 dropout_proportion_ = dropout_proportion;
137 }
139 BaseFloat DropoutProportion() const { return dropout_proportion_; }
140 private:
141 int32 dim_;
142 /// dropout-proportion is the proportion that is dropped out,
143 /// e.g. if 0.1, we set 10% of the values to zero.
144 BaseFloat dropout_proportion_;
145 bool dropout_per_frame_;
146 };
148 class ElementwiseProductComponent: public Component {
149 public:
150 void Init(int32 input_dim, int32 output_dim);
151 explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
152 Init(input_dim, output_dim);
153 }
154 virtual int32 Properties() const {
155 return kSimpleComponent|kBackpropNeedsInput;
156 }
157 ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
158 virtual std::string Type() const { return "ElementwiseProductComponent"; }
159 virtual void InitFromConfig(ConfigLine *cfl);
160 virtual int32 InputDim() const { return input_dim_; }
161 virtual int32 OutputDim() const { return output_dim_; }
162 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
163 const CuMatrixBase<BaseFloat> &in,
164 CuMatrixBase<BaseFloat> *out) const;
165 virtual void Backprop(const std::string &debug_info,
166 const ComponentPrecomputedIndexes *indexes,
167 const CuMatrixBase<BaseFloat> &in_value,
168 const CuMatrixBase<BaseFloat> &out_value,
169 const CuMatrixBase<BaseFloat> &out_deriv,
170 void *memo,
171 Component *to_update,
172 CuMatrixBase<BaseFloat> *in_deriv) const;
173 virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
174 output_dim_); }
176 virtual void Read(std::istream &is, bool binary); // This Read function
177 // requires that the Component has the correct type.
179 /// Write component to stream
180 virtual void Write(std::ostream &os, bool binary) const;
182 protected:
183 int32 input_dim_;
184 int32 output_dim_;
185 };
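// NormalizeComponent scales each row of its input so that the row's
// root-mean-square value equals target-rms; it has no trainable parameters.
// As a rough sketch of the math (an assumption stated here for orientation,
// not quoted from the implementation):
//   rms(x) = sqrt((1/D) * sum_i x_i^2),   y = x * target_rms / rms(x),
// and if add-log-stddev=true, one extra output dimension carries
// log(max(epsilon, rms(x))).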
187 class NormalizeComponent: public Component {
188 public:
189 void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
190 explicit NormalizeComponent(int32 input_dim,
191 BaseFloat target_rms = 1.0,
192 bool add_log_stddev = false) {
193 Init(input_dim, target_rms, add_log_stddev);
194 }
195 explicit NormalizeComponent(const NormalizeComponent &other);
196 // note: there is some special code in NonlinearComponent::Info() that
197 // specifically caters to this class.
198 virtual int32 Properties() const {
199 return (add_log_stddev_ ?
200 kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
201 kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
202 kBackpropAdds|kBackpropInPlace);
203 }
204 NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
205 virtual std::string Type() const { return "NormalizeComponent"; }
206 virtual void InitFromConfig(ConfigLine *cfl);
207 virtual Component* Copy() const { return new NormalizeComponent(*this); }
208 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
209 const CuMatrixBase<BaseFloat> &in,
210 CuMatrixBase<BaseFloat> *out) const;
211 virtual void Backprop(const std::string &debug_info,
212 const ComponentPrecomputedIndexes *indexes,
213 const CuMatrixBase<BaseFloat> &in_value,
214 const CuMatrixBase<BaseFloat> &, // out_value
215 const CuMatrixBase<BaseFloat> &out_deriv,
216 void *memo,
217 Component *to_update,
218 CuMatrixBase<BaseFloat> *in_deriv) const;
220 virtual void Read(std::istream &is, bool binary);
221 virtual void Write(std::ostream &os, bool binary) const;
222 virtual int32 InputDim() const { return input_dim_; }
223 virtual int32 OutputDim() const {
224 return (input_dim_ + (add_log_stddev_ ? 1 : 0));
225 }
226 virtual std::string Info() const;
227 private:
228 NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
229 enum { kExpSquaredNormFloor = -66 };
230 // kSquaredNormFloor is 2^-66, about 1.4e-20.  We need a value that's exactly
231 // representable in float and whose inverse square root is also exactly
232 // representable in float (hence, an even power of two).
233 static const BaseFloat kSquaredNormFloor;
234 int32 input_dim_;
235 BaseFloat target_rms_; // The target rms for outputs.
237 bool add_log_stddev_; // If true, log(max(epsilon, sqrt(row_in^T row_in / D)))
238 // is an extra dimension of the output.
239 };
242 /*
243 Implements the sigmoid nonlinearity, i.e. the function y = 1 / (1 + exp(-x)).
245 Configuration values accepted:
246 dim Dimension of this component, e.g. 1024
248 Configuration values inherited from NonlinearComponent, and their
249 local meanings:
250 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.05. This
251 controls the self-repair mechanism, which for sigmoid units
252 consists of identifying units which are oversaturated (i.e.
253 whose outputs are usually close to 0 or 1) and nudging the inputs to be
254 closer to zero. It gates on the average derivative of the
255 nonlinearity, which for sigmoid is a value between 0 and
256 0.25. For units where the average function-derivative
257 accumulated during this iteration (job) of training is less
258 than this threshold, we activate self-repair, which consists
259 of adding (-self-repair-scale * (2*the output of the
260 nonlinearity - 1.0)) to the backpropagated derivatives.
261 This just happens to be a convenient-to-compute function
262 that's +1 for large negative inputs, and -1 for large positive
263 inputs, and smooth in between.
264 The default value of this is -1000, which the code internally
265 maps to 0.05 which is suitable for sigmoid units; if you do set it,
266 you can set it to a value like 0.025 or 0.075.
267 self-repair-scale Scale for the self-repair mechanism; see comments above.
268 default=0, but we usually set this to 1.0e-05 (or
269 occasionally 1.0e-04) in the scripts.
271 */
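// For illustration (hedged; assuming the usual nnet3 config syntax), a config
// line using the values described above might look like
//   component name=sigmoid1 type=SigmoidComponent dim=1024 self-repair-scale=1.0e-05
// With these settings, backprop would add -1.0e-05 * (2*y - 1) to the input
// derivative of any unit whose average derivative fell below the implicit
// lower threshold of 0.05.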
272 class SigmoidComponent: public NonlinearComponent {
273 public:
274 explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
275 SigmoidComponent() { }
276 virtual std::string Type() const { return "SigmoidComponent"; }
277 virtual int32 Properties() const {
278 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
279 }
280 virtual Component* Copy() const { return new SigmoidComponent(*this); }
281 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
282 const CuMatrixBase<BaseFloat> &in,
283 CuMatrixBase<BaseFloat> *out) const;
284 virtual void Backprop(const std::string &debug_info,
285 const ComponentPrecomputedIndexes *indexes,
286 const CuMatrixBase<BaseFloat> &, //in_value
287 const CuMatrixBase<BaseFloat> &out_value,
288 const CuMatrixBase<BaseFloat> &out_deriv,
289 void *memo,
290 Component *to_update,
291 CuMatrixBase<BaseFloat> *in_deriv) const;
292 virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
293 const CuMatrixBase<BaseFloat> &out_value,
294 void *memo);
295 private:
296 // this function is called from Backprop code and only does something if the
297 // self-repair-scale config value is set.
298 void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
299 CuMatrixBase<BaseFloat> *in_deriv,
300 SigmoidComponent *to_update) const;
302 SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
303 };
305 /*
306 Implements the tanh nonlinearity, i.e. the function y = tanh(x).
308 Configuration values accepted:
309 dim Dimension of this component, e.g. 1024
311 Configuration values inherited from NonlinearComponent, and their
312 local meanings:
313 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.2. This
314 controls the self-repair mechanism, which for tanh units
315 consists of identifying units which are oversaturated (i.e.
316 usually close to -1 or +1) and nudging the inputs to be
317 closer to zero. It gates on the average derivative of
318 the nonlinearity, which for tanh is a value between 0 and 1.
319 For units where the average function-derivative accumulated
320 during this iteration (job) of training is less than
321 this threshold, we activate self-repair, which consists of
322 adding (-self-repair-scale * the output of the nonlinearity),
323 i.e. (-self-repair-scale * tanh(x)) to the backpropagated
324 derivatives.
325 The default value of this is -1000, which the code internally
326 maps to 0.2 which is suitable for tanh units; if you do set it,
327 you can set it to a value like 0.1 or 0.3.
328 self-repair-scale Scale for the self-repair mechanism; see comments above.
329 default=0, but we usually set this to 1.0e-05 (or
330 occasionally 1.0e-04) in the scripts.
331 */
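// For illustration (hedged; assuming the usual nnet3 config syntax):
//   component name=tanh1 type=TanhComponent dim=1024 self-repair-scale=1.0e-05
// would add -1.0e-05 * tanh(x) to the input derivative of any unit whose
// average derivative fell below the implicit threshold of 0.2.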
332 class TanhComponent: public NonlinearComponent {
333 public:
334 explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
335 TanhComponent() { }
336 virtual std::string Type() const { return "TanhComponent"; }
337 virtual Component* Copy() const { return new TanhComponent(*this); }
338 virtual int32 Properties() const {
339 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
340 }
341 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
342 const CuMatrixBase<BaseFloat> &in,
343 CuMatrixBase<BaseFloat> *out) const;
344 virtual void Backprop(const std::string &debug_info,
345 const ComponentPrecomputedIndexes *indexes,
346 const CuMatrixBase<BaseFloat> &, //in_value
347 const CuMatrixBase<BaseFloat> &out_value,
348 const CuMatrixBase<BaseFloat> &out_deriv,
349 void *memo,
350 Component *to_update,
351 CuMatrixBase<BaseFloat> *in_deriv) const;
352 virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
353 const CuMatrixBase<BaseFloat> &out_value,
354 void *memo);
355 private:
356 // this function is called from Backprop code and only does something if the
357 // self-repair-scale config value is set.
358 void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
359 CuMatrixBase<BaseFloat> *in_deriv,
360 TanhComponent *to_update) const;
362 TanhComponent &operator = (const TanhComponent &other); // Disallow.
363 };
366 /*
367 Implements the Rectified Linear Unit nonlinearity, a.k.a. ReLU.
369 Configuration values accepted:
370 dim Dimension of this component, e.g. 1024
372 Configuration values inherited from NonlinearComponent, and their
373 local meanings:
374 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.05. Lower
375 threshold for self-repair, if set; in this case it acts on
376 the average function-derivative, which is the proportion
377 of the time the output is > 0. For any unit where the
378 average function-derivative is lower than this threshold,
379 we add 'self-repair-scale' to the backpropagated
380 derivatives in backprop. There is no usable default
381 (the stored default of -1000 is interpreted specially).
382 self-repair-upper-threshold e.g. self-repair-upper-threshold=0.95.
383 Like self-repair-lower-threshold, but controls self-repair
384 for units that are active *too* much of the time. Units
385 whose average function-derivative exceeds this threshold
386 will have the negative of 'self-repair-scale' added to their
387 input derivatives in backprop. There is no usable default
388 (the stored default of -1000 is interpreted specially).
389 self-repair-scale Scale for the self-repair mechanism; see comments for
390 self-repair-lower-threshold and self-repair-upper-threshold
391 for details. default=0, but we usually set this to 1.0e-05
392 (or occasionally 1.0e-04) in the scripts.
393 */
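// For illustration (hedged; assuming the usual nnet3 config syntax):
//   component name=relu1 type=RectifiedLinearComponent dim=1024 self-repair-lower-threshold=0.05 self-repair-upper-threshold=0.95 self-repair-scale=1.0e-05
// would add +1.0e-05 to the input derivatives of units active less than 5% of
// the time, and -1.0e-05 to those active more than 95% of the time.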
394 class RectifiedLinearComponent: public NonlinearComponent {
395 public:
396 explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
397 NonlinearComponent(other) { }
398 RectifiedLinearComponent() { }
399 virtual std::string Type() const { return "RectifiedLinearComponent"; }
400 virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
401 virtual int32 Properties() const {
402 return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
403 kStoresStats;
404 }
405 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
406 const CuMatrixBase<BaseFloat> &in,
407 CuMatrixBase<BaseFloat> *out) const;
408 virtual void Backprop(const std::string &debug_info,
409 const ComponentPrecomputedIndexes *indexes,
410 const CuMatrixBase<BaseFloat> &, //in_value
411 const CuMatrixBase<BaseFloat> &out_value,
412 const CuMatrixBase<BaseFloat> &out_deriv,
413 void *memo,
414 Component *to_update,
415 CuMatrixBase<BaseFloat> *in_deriv) const;
416 virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
417 const CuMatrixBase<BaseFloat> &out_value,
418 void *memo);
419 private:
420 // this function is called from Backprop code and only does something if the
421 // self-repair-scale config value is set.
422 void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
423 RectifiedLinearComponent *to_update) const;
425 RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
426 };
429 class FixedAffineComponent;
430 class FixedScaleComponent;
431 class PerElementScaleComponent;
432 class PerElementOffsetComponent;
434 // Affine means a linear function plus an offset.
435 // Note: although this class can be instantiated, it also
436 // functions as a base-class for more specialized versions of
437 // AffineComponent.
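// Schematically (a sketch inferred from the dimensions below, not quoted from
// the implementation): for an input row-vector x,
//   y = x * linear_params_^T + bias_params_,
// where linear_params_ is output-dim x input-dim and bias_params_ has
// dimension output-dim.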
438 class AffineComponent: public UpdatableComponent {
439 friend class SoftmaxComponent; // Friend declaration relates to mixing up.
440 public:
442 virtual int32 InputDim() const { return linear_params_.NumCols(); }
443 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
445 virtual std::string Info() const;
446 virtual void InitFromConfig(ConfigLine *cfl);
448 AffineComponent() { } // use Init to really initialize.
449 virtual std::string Type() const { return "AffineComponent"; }
450 virtual int32 Properties() const {
451 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
452 kBackpropNeedsInput|kBackpropAdds;
453 }
456 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
457 const CuMatrixBase<BaseFloat> &in,
458 CuMatrixBase<BaseFloat> *out) const;
459 virtual void Backprop(const std::string &debug_info,
460 const ComponentPrecomputedIndexes *indexes,
461 const CuMatrixBase<BaseFloat> &in_value,
462 const CuMatrixBase<BaseFloat> &, // out_value
463 const CuMatrixBase<BaseFloat> &out_deriv,
464 void *memo,
465 Component *to_update,
466 CuMatrixBase<BaseFloat> *in_deriv) const;
468 virtual void Read(std::istream &is, bool binary);
469 virtual void Write(std::ostream &os, bool binary) const;
471 virtual Component* Copy() const;
474 // Some functions from base-class UpdatableComponent.
475 virtual void Scale(BaseFloat scale);
476 virtual void Add(BaseFloat alpha, const Component &other);
477 virtual void PerturbParams(BaseFloat stddev);
478 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
479 virtual int32 NumParameters() const;
480 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
481 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
483 // Some functions that are specific to this class.
485 virtual void SetParams(const CuVectorBase<BaseFloat> &bias,
486 const CuMatrixBase<BaseFloat> &linear);
487 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
488 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
489 explicit AffineComponent(const AffineComponent &other);
490 // The next constructor is used in converting from nnet1.
491 AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
492 const CuVectorBase<BaseFloat> &bias_params,
493 BaseFloat learning_rate);
494 void Init(int32 input_dim, int32 output_dim,
495 BaseFloat param_stddev, BaseFloat bias_stddev);
496 void Init(std::string matrix_filename);
498 // This function resizes the dimensions of the component, setting the
499 // parameters to zero, while leaving any other configuration values the same.
500 virtual void Resize(int32 input_dim, int32 output_dim);
502 protected:
503 friend class NaturalGradientAffineComponent;
504 // This function Update() is for extensibility; child classes may override
505 // this, e.g. for natural gradient update.
506 virtual void Update(
507 const std::string &debug_info,
508 const CuMatrixBase<BaseFloat> &in_value,
509 const CuMatrixBase<BaseFloat> &out_deriv) {
510 UpdateSimple(in_value, out_deriv);
511 }
512 // UpdateSimple is used when *this is a gradient. Child classes may override
513 // this if needed, but typically won't need to.
514 virtual void UpdateSimple(
515 const CuMatrixBase<BaseFloat> &in_value,
516 const CuMatrixBase<BaseFloat> &out_deriv);
518 const AffineComponent &operator = (const AffineComponent &other); // Disallow.
519 CuMatrix<BaseFloat> linear_params_;
520 CuVector<BaseFloat> bias_params_;
521 };
523 class RepeatedAffineComponent;
525 /// This class implements an affine transform using a block diagonal matrix
526 /// i.e., one whose weight matrix is all zeros except for blocks on the
527 /// diagonal. All these blocks have the same dimensions.
528 /// input-dim: num cols of block diagonal matrix.
529 /// output-dim: num rows of block diagonal matrix.
530 /// num-blocks: number of blocks in diagonal of the matrix.
531 /// num-blocks must divide both input-dim and output-dim
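/// For example (an illustrative sketch): input-dim=600, output-dim=300,
/// num-blocks=3 gives three independent 100 x 200 affine blocks; block b acts
/// on input dimensions [200*b, 200*(b+1)) and produces output dimensions
/// [100*b, 100*(b+1)).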
532 class BlockAffineComponent : public UpdatableComponent {
533 public:
534 virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
535 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
537 virtual std::string Info() const;
538 virtual void InitFromConfig(ConfigLine *cfl);
540 BlockAffineComponent() { }
541 virtual std::string Type() const { return "BlockAffineComponent"; }
542 virtual int32 Properties() const {
543 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
544 kBackpropNeedsInput|kBackpropAdds;
545 }
547 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
548 const CuMatrixBase<BaseFloat> &in,
549 CuMatrixBase<BaseFloat> *out) const;
551 virtual void Backprop(const std::string &debug_info,
552 const ComponentPrecomputedIndexes *indexes,
553 const CuMatrixBase<BaseFloat> &in_value,
554 const CuMatrixBase<BaseFloat> &, // out_value
555 const CuMatrixBase<BaseFloat> &out_deriv,
556 void *memo,
557 Component *to_update,
558 CuMatrixBase<BaseFloat> *in_deriv) const;
560 virtual void Read(std::istream &is, bool binary);
561 virtual void Write(std::ostream &os, bool binary) const;
563 virtual Component* Copy() const;
565 // Functions from base-class UpdatableComponent.
566 virtual void Scale(BaseFloat scale);
567 virtual void Add(BaseFloat alpha, const Component &other);
568 virtual void PerturbParams(BaseFloat stddev);
569 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
570 virtual int32 NumParameters() const;
571 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
572 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
574 // BlockAffine-specific functions.
575 void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
576 BaseFloat param_stddev, BaseFloat bias_mean,
577 BaseFloat bias_stddev);
578 explicit BlockAffineComponent(const BlockAffineComponent &other);
579 explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
580 protected:
581 // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
582 // equal size. The blocks are stored in linear_params_ as
583 // [ M
584 // N
585 // O ] but we actually treat it as the matrix:
586 // [ M 0 0
587 // 0 N 0
588 // 0 0 O ]
589 CuMatrix<BaseFloat> linear_params_;
590 CuVector<BaseFloat> bias_params_;
591 int32 num_blocks_;
592 private:
593 const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
594 };
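// RepeatedAffineComponent applies one shared affine transform (linear_params_,
// bias_params_) to each of num_repeats consecutive blocks of the input; e.g.
// (an illustrative sketch inferred from the dimensions below) input-dim=600,
// output-dim=300, num-repeats=3 applies a single 100 x 200 transform to each
// of the three 200-dimensional input blocks.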
596 class RepeatedAffineComponent: public UpdatableComponent {
597 public:
599 virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
600 virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
602 virtual std::string Info() const;
603 virtual void InitFromConfig(ConfigLine *cfl);
605 RepeatedAffineComponent() { } // use Init to really initialize.
606 virtual std::string Type() const { return "RepeatedAffineComponent"; }
607 virtual int32 Properties() const {
608 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
609 kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
610 }
611 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
612 const CuMatrixBase<BaseFloat> &in,
613 CuMatrixBase<BaseFloat> *out) const;
614 virtual void Backprop(const std::string &debug_info,
615 const ComponentPrecomputedIndexes *indexes,
616 const CuMatrixBase<BaseFloat> &in_value,
617 const CuMatrixBase<BaseFloat> &, // out_value
618 const CuMatrixBase<BaseFloat> &out_deriv,
619 void *memo,
620 Component *to_update,
621 CuMatrixBase<BaseFloat> *in_deriv) const;
623 virtual void Read(std::istream &is, bool binary);
624 virtual void Write(std::ostream &os, bool binary) const;
626 virtual Component* Copy() const;
628 // Some functions from base-class UpdatableComponent.
629 virtual void Scale(BaseFloat scale);
630 virtual void Add(BaseFloat alpha, const Component &other);
631 virtual void PerturbParams(BaseFloat stddev);
632 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
633 virtual int32 NumParameters() const;
634 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
635 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
637 // Some functions that are specific to this class.
638 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
639 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
640 explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
642 void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
643 BaseFloat param_stddev, BaseFloat bias_mean,
644 BaseFloat bias_stddev);
645 friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
646 protected:
647 // This function Update(), called from backprop, is broken out for
648 // extensibility to natural gradient update.
649 virtual void Update(
650 const CuMatrixBase<BaseFloat> &in_value,
651 const CuMatrixBase<BaseFloat> &out_deriv);
653 // This function does nothing here but is redefined in child-class
654 // NaturalGradientRepeatedAffineComponent. This helps avoid repeated code.
655 virtual void SetNaturalGradientConfigs() { }
657 const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
658 CuMatrix<BaseFloat> linear_params_;
659 CuVector<BaseFloat> bias_params_;
660 int32 num_repeats_;
661 };
663 class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
664 public:
665 // Use Init() to really initialize.
666 NaturalGradientRepeatedAffineComponent() { }
668 // Most of the public functions are inherited from RepeatedAffineComponent.
669 virtual std::string Type() const {
670 return "NaturalGradientRepeatedAffineComponent";
671 }
673 virtual Component* Copy() const;
675 // Copy constructor
676 explicit NaturalGradientRepeatedAffineComponent(
677 const NaturalGradientRepeatedAffineComponent &other);
678 private:
679 virtual void Update(
680 const CuMatrixBase<BaseFloat> &in_value,
681 const CuMatrixBase<BaseFloat> &out_deriv);
683 const NaturalGradientRepeatedAffineComponent &operator=(
684 const NaturalGradientRepeatedAffineComponent &other); // Disallow.
686 // Applies the default configuration to preconditioner_in_.
687 virtual void SetNaturalGradientConfigs();
689 // For efficiency reasons we only apply the natural gradient to the input
690 // side, i.e. not to the space of output derivatives-- we believe the input
691 // side is the more important side. We don't make the natural-gradient
692 // configurable; we just give it a reasonable configuration.
693 // Instead of using the individual data-points, for efficiency reasons we use
694 // the distribution of per-minibatch summed derivatives over each dimension of
695 // the output space, as the source for the Fisher matrix.
696 OnlineNaturalGradient preconditioner_in_;
697 };
699 class SoftmaxComponent: public NonlinearComponent {
700 public:
701 explicit SoftmaxComponent(const SoftmaxComponent &other):
702 NonlinearComponent(other) { }
703 SoftmaxComponent() { }
704 virtual std::string Type() const { return "SoftmaxComponent"; }
705 virtual int32 Properties() const {
706 return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
707 }
708 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
709 const CuMatrixBase<BaseFloat> &in,
710 CuMatrixBase<BaseFloat> *out) const;
711 virtual void Backprop(const std::string &debug_info,
712 const ComponentPrecomputedIndexes *indexes,
713 const CuMatrixBase<BaseFloat> &in_value,
714 const CuMatrixBase<BaseFloat> &out_value,
715 const CuMatrixBase<BaseFloat> &out_deriv,
716 void *memo,
717 Component *to_update,
718 CuMatrixBase<BaseFloat> *in_deriv) const;
719 virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
720 const CuMatrixBase<BaseFloat> &out_value,
721 void *memo);
722 virtual Component* Copy() const { return new SoftmaxComponent(*this); }
723 private:
724 SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
725 };
728 /*
729 Implements the log of a softmax nonlinearity, so it's the same
730 as shifting each input vector by a constant offset so that, when
731 exponentiated, it would sum to one.
733 We usually use this in place of softmax because the log-scale
734 output will not saturate.
736 Configuration values accepted:
737 dim e.g. dim=8061. Usually this is the last component
738 in a network, so 'dim' is the number of classes.
739 */
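// Concretely (standard log-softmax math, stated here for reference):
//   y_i = x_i - log(sum_j exp(x_j)),
// so that exp(y) sums to one, as described above.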
740 class LogSoftmaxComponent: public NonlinearComponent {
741 public:
742 explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
743 NonlinearComponent(other) { }
744 LogSoftmaxComponent() { }
745 virtual std::string Type() const { return "LogSoftmaxComponent"; }
746 virtual int32 Properties() const {
747 return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
748 }
749 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
750 const CuMatrixBase<BaseFloat> &in,
751 CuMatrixBase<BaseFloat> *out) const;
752 virtual void Backprop(const std::string &debug_info,
753 const ComponentPrecomputedIndexes *indexes,
754 const CuMatrixBase<BaseFloat> &in_value,
755 const CuMatrixBase<BaseFloat> &out_value,
756 const CuMatrixBase<BaseFloat> &out_deriv,
757 void *memo,
758 Component *to_update,
759 CuMatrixBase<BaseFloat> *in_deriv) const;
761 virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
762 private:
763 LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
764 };
766 /*
767 Keywords: natural gradient descent, NG-SGD, naturalgradient. For
768 the top-level of the natural gradient code look here, and also in
769 nnet-precondition-online.h.
770 NaturalGradientAffineComponent is
771 a version of AffineComponent with a learning-rate matrix that is not simply a
772 multiple of the unit (identity) matrix. See nnet-precondition-online.h for a description of the technique.
773 It is described, under the name Online NG-SGD, in the paper "Parallel
774 training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
775 workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
777 Configuration values accepted by this component:
779 Values inherited from UpdatableComponent (see its declaration in
780 nnet-component-itf for details):
781 learning-rate
782 learning-rate-factor
783 max-change
785 Values used in initializing the component's parameters:
786 input-dim e.g. input-dim=1024. The input dimension.
787 output-dim e.g. output-dim=1024. The output dimension.
788 param-stddev e.g. param-stddev=0.025. The standard deviation
789 used to randomly initialize the linear parameters
790 (as Gaussian random values * param-stddev).
791 Defaults to 1/sqrt(input-dim), which is Glorot
792 initialization.
793 bias-stddev e.g. bias-stddev=0.0. The standard deviation
794 used to randomly initialize the bias parameters.
795 Defaults to 1.0 but we usually set it to 0.0
796 in the config.
797 bias-mean e.g. bias-mean=1.0. Allows you to initialize the
798 bias parameters with an offset. Default is 0.0,
799 which is normally suitable.
801 matrix e.g. matrix=foo/bar/init.mat May be used as an
802 alternative to (input-dim, output-dim, param-stddev,
803 bias-stddev, bias-mean) to initialize the parameters.
804 Dimension is output-dim by (input-dim + 1), last
805 column is interpreted as the bias.
807 Options to the natural gradient (you won't normally have to set these,
808 the defaults are suitable):
810 num-samples-history Number of frames used as the time-constant to
811 determine how 'up-to-date' the Fisher-matrix
812 estimates are. Smaller -> more up-to-date, but more
813 noisy. default=2000.
814 alpha Constant that determines how much we smooth the
815 Fisher-matrix estimates with the unit matrix.
816 Larger means more smoothing. default=4.0
817 rank-in Rank used in low-rank-plus-unit estimate of Fisher
818 matrix in the input space. default=20.
819 rank-out Rank used in low-rank-plus-unit estimate of Fisher
820 matrix in the output-derivative space. default=80.
821 update-period Determines with what frequency (in
822 minibatches) we update the Fisher-matrix estimates;
823 making this > 1 saves a little time in training.
824 default=4.
825 */
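// For illustration (hedged; assuming the usual nnet3 config syntax and the
// defaults listed above), a typical config line might look like
//   component name=affine1 type=NaturalGradientAffineComponent input-dim=512 output-dim=512 learning-rate=0.001 bias-stddev=0
// leaving param-stddev at its default of 1/sqrt(input-dim) and the natural
// gradient options at their defaults.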
826 class NaturalGradientAffineComponent: public AffineComponent {
827 public:
828 virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
829 virtual void Read(std::istream &is, bool binary);
830 virtual void Write(std::ostream &os, bool binary) const;
831 void Init(int32 input_dim, int32 output_dim,
832 BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
833 int32 rank_in, int32 rank_out, int32 update_period,
834 BaseFloat num_samples_history, BaseFloat alpha);
835 void Init(int32 rank_in, int32 rank_out, int32 update_period,
836 BaseFloat num_samples_history,
837 BaseFloat alpha, std::string matrix_filename);
838 // this constructor does not really initialize, use Init() or Read().
839 NaturalGradientAffineComponent();
840 void Resize(int32 input_dim, int32 output_dim);
841 void InitFromConfig(ConfigLine *cfl);
842 virtual std::string Info() const;
843 virtual Component* Copy() const;
844 virtual void Scale(BaseFloat scale);
845 virtual void Add(BaseFloat alpha, const Component &other);
846 virtual void FreezeNaturalGradient(bool freeze);
847 // copy constructor
848 explicit NaturalGradientAffineComponent(
849 const NaturalGradientAffineComponent &other);
850 private:
851 // disallow assignment operator.
852 NaturalGradientAffineComponent &operator= (
853 const NaturalGradientAffineComponent&);
855 // Configs for the preconditioners. The input side tends to be better conditioned
856 // and so needs a smaller rank, which is why the ranks are separately configurable.
857 int32 rank_in_;
858 int32 rank_out_;
859 int32 update_period_;
860 BaseFloat num_samples_history_;
861 BaseFloat alpha_;
863 OnlineNaturalGradient preconditioner_in_;
865 OnlineNaturalGradient preconditioner_out_;
867 // Sets the configs rank, alpha and eta in the preconditioner objects,
868 // from the class variables.
869 void SetNaturalGradientConfigs();
871 virtual void Update(
872 const std::string &debug_info,
873 const CuMatrixBase<BaseFloat> &in_value,
874 const CuMatrixBase<BaseFloat> &out_deriv);
875 };
878 /// FixedAffineComponent is an affine transform that is supplied
879 /// at network initialization time and is not trainable.
880 class FixedAffineComponent: public Component {
881 public:
882 FixedAffineComponent() { }
883 virtual std::string Type() const { return "FixedAffineComponent"; }
884 virtual std::string Info() const;
886 // Copy constructor from AffineComponent-- can be used when we're done
887 // training a particular part of the model and want to efficiently disable
888 // further training.
889 FixedAffineComponent(const AffineComponent &c);
891 /// matrix should be of size output-dim by (input-dim + 1); the last column is the offset (bias)
892 void Init(const CuMatrixBase<BaseFloat> &matrix);
894 // The ConfigLine cfl contains just the option matrix=<string>,
895 // where the string is the filename of a Kaldi-format matrix to read.
896 virtual void InitFromConfig(ConfigLine *cfl);
898 virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
899 virtual int32 InputDim() const { return linear_params_.NumCols(); }
900 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
902 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
903 const CuMatrixBase<BaseFloat> &in,
904 CuMatrixBase<BaseFloat> *out) const;
905 virtual void Backprop(const std::string &debug_info,
906 const ComponentPrecomputedIndexes *indexes,
907 const CuMatrixBase<BaseFloat> &in_value,
908 const CuMatrixBase<BaseFloat> &, // out_value
909 const CuMatrixBase<BaseFloat> &out_deriv,
910 void *memo,
911 Component *to_update,
912 CuMatrixBase<BaseFloat> *in_deriv) const;
915 virtual Component* Copy() const;
916 virtual void Read(std::istream &is, bool binary);
917 virtual void Write(std::ostream &os, bool binary) const;
919 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
920 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
921 protected:
922 friend class AffineComponent;
923 CuMatrix<BaseFloat> linear_params_;
924 CuVector<BaseFloat> bias_params_;
926 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
927 };
929 /// SumGroupComponent is used to sum up groups of posteriors.
930 /// It's used to introduce a kind of Gaussian-mixture-model-like
931 /// idea into neural nets. This is basically a degenerate case of
932 /// MixtureProbComponent; we had to implement it separately to
933 /// be efficient for CUDA (we can use this one regardless of whether
934 /// we have CUDA or not; it's the normal case we want anyway).
935 ///
936 /// There are two forms of initialization in a config file: one
937 /// where the number of elements is specified for each group
938 /// individually as a vector, and one where only the total input
939 /// dimension and the output dimension (number of groups) are specified.
940 /// The second is used when all groups have the same size.
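/// For illustration (a hedged example; the config keys are assumed to match
/// the initializers below): sizes=2,3,4 would create three groups summing 2, 3
/// and 4 consecutive inputs (input-dim 9, output-dim 3), while
/// input-dim=9 output-dim=3 would create three equal groups of size 3.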
941 class SumGroupComponent: public Component {
942 public:
943 virtual int32 InputDim() const { return input_dim_; }
944 virtual int32 OutputDim() const { return output_dim_; }
945 void Init(const std::vector<int32> &sizes); // the vector gives, for each output
946 // dim, the number of inputs (>= 1) summed over.
947 void Init(int32 input_dim, int32 output_dim);
948 void GetSizes(std::vector<int32> *sizes) const; // Get a vector saying, for
949 // each output-dim, how many
950 // inputs were summed over.
951 virtual void InitFromConfig(ConfigLine *cfl);
952 SumGroupComponent() { }
953 virtual std::string Type() const { return "SumGroupComponent"; }
954 virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
955 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
956 const CuMatrixBase<BaseFloat> &in,
957 CuMatrixBase<BaseFloat> *out) const;
958 virtual void Backprop(const std::string &debug_info,
959 const ComponentPrecomputedIndexes *indexes,
960 const CuMatrixBase<BaseFloat> &in_value,
961 const CuMatrixBase<BaseFloat> &, // out_value
962 const CuMatrixBase<BaseFloat> &out_deriv,
963 void *memo,
964 Component *to_update,
965 CuMatrixBase<BaseFloat> *in_deriv) const;
966 virtual Component* Copy() const;
967 virtual void Read(std::istream &is, bool binary);
968 virtual void Write(std::ostream &os, bool binary) const;
970 private:
971 KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
972 // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
973 // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
974 CuArray<Int32Pair> indexes_; // for each output index, the (start, end) input
975 // index.
976 CuArray<int32> reverse_indexes_; // for each input index, the output index.
977 int32 input_dim_;
978 int32 output_dim_;
979 };
982 /// FixedScaleComponent applies a fixed per-element scale; it's similar
983 /// to the Rescale component in the nnet1 setup (and only needed for nnet1
984 /// model conversion).
985 class FixedScaleComponent: public Component {
986 public:
987 FixedScaleComponent() { }
988 virtual std::string Type() const { return "FixedScaleComponent"; }
989 virtual std::string Info() const;
990 virtual int32 Properties() const {
991 return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
992 }
994 void Init(const CuVectorBase<BaseFloat> &scales);
996 // The ConfigLine cfl contains only the option scales=<string>,
997 // where the string is the filename of a Kaldi-format vector to read.
998 virtual void InitFromConfig(ConfigLine *cfl);
1000 virtual int32 InputDim() const { return scales_.Dim(); }
1001 virtual int32 OutputDim() const { return scales_.Dim(); }
1003 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1004 const CuMatrixBase<BaseFloat> &in,
1005 CuMatrixBase<BaseFloat> *out) const;
1006 virtual void Backprop(const std::string &debug_info,
1007 const ComponentPrecomputedIndexes *indexes,
1008 const CuMatrixBase<BaseFloat> &, // in_value
1009 const CuMatrixBase<BaseFloat> &, // out_value
1010 const CuMatrixBase<BaseFloat> &out_deriv,
1011 void *memo,
1012 Component *, // to_update
1013 CuMatrixBase<BaseFloat> *in_deriv) const;
1014 virtual Component* Copy() const;
1015 virtual void Read(std::istream &is, bool binary);
1016 virtual void Write(std::ostream &os, bool binary) const;
1018 const CuVector<BaseFloat> &Scales() const { return scales_; }
1019 protected:
1020 CuVector<BaseFloat> scales_;
1021 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
1022 };
1025 /// FixedBiasComponent applies a fixed per-element bias; it's similar
1026 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
1027 /// model conversion).
1028 class FixedBiasComponent: public Component {
1029 public:
1030 FixedBiasComponent() { }
1031 virtual std::string Type() const { return "FixedBiasComponent"; }
1032 virtual std::string Info() const;
1034 virtual int32 Properties() const {
1035 return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
1036 }
1038 void Init(const CuVectorBase<BaseFloat> &bias);
1040 // The ConfigLine cfl contains only the option bias=<string>,
1041 // where the string is the filename of a Kaldi-format vector to read.
1042 virtual void InitFromConfig(ConfigLine *cfl);
1043 virtual int32 InputDim() const { return bias_.Dim(); }
1044 virtual int32 OutputDim() const { return bias_.Dim(); }
1045 using Component::Propagate; // to avoid name hiding
1046 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1047 const CuMatrixBase<BaseFloat> &in,
1048 CuMatrixBase<BaseFloat> *out) const;
1049 virtual void Backprop(const std::string &debug_info,
1050 const ComponentPrecomputedIndexes *indexes,
1051 const CuMatrixBase<BaseFloat> &, // in_value,
1052 const CuMatrixBase<BaseFloat> &, // out_value
1053 const CuMatrixBase<BaseFloat> &out_deriv,
1054 void *memo,
1055 Component *, // to_update
1056 CuMatrixBase<BaseFloat> *in_deriv) const;
1057 virtual Component* Copy() const;
1058 virtual void Read(std::istream &is, bool binary);
1059 virtual void Write(std::ostream &os, bool binary) const;
1061 protected:
1062 CuVector<BaseFloat> bias_;
1063 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
1064 };
1066 /** NoOpComponent just duplicates its input. We don't anticipate this being used
1067 very often, but it may sometimes make your life easier.
1068 The only config parameter it accepts is 'dim', e.g. 'dim=400'.
1069 */
1070 class NoOpComponent: public NonlinearComponent {
1071 public:
1072 explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
1073 NoOpComponent() { }
1074 virtual std::string Type() const { return "NoOpComponent"; }
1075 virtual int32 Properties() const {
1076 return kSimpleComponent|kLinearInInput|kPropagateInPlace;
1077 }
1078 virtual Component* Copy() const { return new NoOpComponent(*this); }
1079 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1080 const CuMatrixBase<BaseFloat> &in,
1081 CuMatrixBase<BaseFloat> *out) const;
1082 virtual void Backprop(const std::string &debug_info,
1083 const ComponentPrecomputedIndexes *indexes,
1084 const CuMatrixBase<BaseFloat> &, //in_value
1085 const CuMatrixBase<BaseFloat> &, // out_value,
1086 const CuMatrixBase<BaseFloat> &out_deriv,
1087 void *memo,
1088 Component *to_update,
1089 CuMatrixBase<BaseFloat> *in_deriv) const;
1090 private:
1091 NoOpComponent &operator = (const NoOpComponent &other); // Disallow.
1092 };
1094 /** SumBlockComponent sums over blocks of its input: for instance, if
1095 you create one with the config "input-dim=400 output-dim=100",
1096 its output will be the sum over the 4 100-dimensional blocks of
1097 the input.
1099 The "scale" config parameter may be used if you want to do averaging
1100 instead of summing, e.g. "input-dim=400 output-dim=100 scale=0.25"
1101 will accomplish averaging.
1103 Accepted values on its config-file line are:
1104 input-dim The input dimension. Required.
1105 output-dim The block dimension. Required. Must divide input-dim.
1106 scale A scaling factor on the output. Defaults to 1.0.
1107 */
1108 class SumBlockComponent: public Component {
1109 public:
1110 explicit SumBlockComponent(const SumBlockComponent &other);
1111 SumBlockComponent() { }
1112 virtual std::string Type() const { return "SumBlockComponent"; }
1113 virtual int32 Properties() const {
1114 return kSimpleComponent|kLinearInInput|kPropagateAdds|kBackpropAdds;
1115 }
1116 virtual void InitFromConfig(ConfigLine *cfl);
1117 virtual int32 InputDim() const { return input_dim_; }
1118 virtual int32 OutputDim() const { return output_dim_; }
1119 virtual void Read(std::istream &is, bool binary);
1120 virtual void Write(std::ostream &os, bool binary) const;
1121 virtual std::string Info() const;
1122 virtual Component* Copy() const { return new SumBlockComponent(*this); }
1123 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1124 const CuMatrixBase<BaseFloat> &in,
1125 CuMatrixBase<BaseFloat> *out) const;
1126 virtual void Backprop(const std::string &debug_info,
1127 const ComponentPrecomputedIndexes *indexes,
1128 const CuMatrixBase<BaseFloat> &, //in_value
1129 const CuMatrixBase<BaseFloat> &, // out_value,
1130 const CuMatrixBase<BaseFloat> &out_deriv,
1131 void *memo,
1132 Component *to_update,
1133 CuMatrixBase<BaseFloat> *in_deriv) const;
1134 private:
1135 int32 input_dim_;
1136 int32 output_dim_;
1137 BaseFloat scale_;
1138 SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow.
1139 };
1142 // ClipGradientComponent just duplicates its input, but clips gradients
1143 // during backpropagation if they cross a predetermined threshold.
1144 // This component is used to prevent the gradient-explosion problem in
1145 // recurrent neural networks.
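// Schematically (an assumption about the clipping rule, stated for
// orientation): with norm-based clipping, any row of the derivative matrix
// whose norm exceeds clipping-threshold is scaled by clipping-threshold / norm;
// otherwise each element is limited to [-clipping-threshold, clipping-threshold].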
1146 class ClipGradientComponent: public Component {
1147 public:
1148 ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
1149 bool norm_based_clipping,
1150 BaseFloat self_repair_clipped_proportion_threshold,
1151 BaseFloat self_repair_target,
1152 BaseFloat self_repair_scale,
1153 int32 num_clipped,
1154 int32 count,
1155 int32 num_self_repaired,
1156 int32 num_backpropped) {
1157 Init(dim, clipping_threshold, norm_based_clipping,
1158 self_repair_clipped_proportion_threshold,
1159 self_repair_target,
1160 self_repair_scale,
1161 num_clipped, count,
1162 num_self_repaired, num_backpropped);}
1164 ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
1165 norm_based_clipping_(false),
1166 self_repair_clipped_proportion_threshold_(1.0),
1167 self_repair_target_(0.0),
1168 self_repair_scale_(0.0),
1169 num_clipped_(0), count_(0),
1170 num_self_repaired_(0), num_backpropped_(0) { }
1172 virtual int32 InputDim() const { return dim_; }
1173 virtual int32 OutputDim() const { return dim_; }
1174 virtual void InitFromConfig(ConfigLine *cfl);
1175 void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
1176 BaseFloat self_repair_clipped_proportion_threshold,
1177 BaseFloat self_repair_target,
1178 BaseFloat self_repair_scale,
1179 int32 num_clipped, int32 count,
1180 int32 num_self_repaired, int32 num_backpropped);
1182 virtual std::string Type() const { return "ClipGradientComponent"; }
1184 virtual int32 Properties() const {
1185 return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
1186 kBackpropNeedsInput;
1187 }
1189 virtual void ZeroStats();
1191 virtual Component* Copy() const {
1192 return new ClipGradientComponent(dim_,
1193 clipping_threshold_,
1194 norm_based_clipping_,
1195 self_repair_clipped_proportion_threshold_,
1196 self_repair_target_,
1197 self_repair_scale_,
1198 num_clipped_,
1199 count_,
1200 num_self_repaired_,
1201 num_backpropped_);}
1203 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1204 const CuMatrixBase<BaseFloat> &in,
1205 CuMatrixBase<BaseFloat> *out) const;
1206 virtual void Backprop(const std::string &debug_info,
1207 const ComponentPrecomputedIndexes *indexes,
1208 const CuMatrixBase<BaseFloat> &in_value,
1209 const CuMatrixBase<BaseFloat> &, // out_value,
1210 const CuMatrixBase<BaseFloat> &out_deriv,
1211 void *memo,
1212 Component *to_update,
1213 CuMatrixBase<BaseFloat> *in_deriv) const;
1215 virtual void Scale(BaseFloat scale);
1216 virtual void Add(BaseFloat alpha, const Component &other);
1217 virtual void Read(std::istream &is, bool binary); // This Read function
1218 // requires that the Component has the correct type.
1219 /// Write component to stream
1220 virtual void Write(std::ostream &os, bool binary) const;
1221 virtual std::string Info() const;
1222 virtual ~ClipGradientComponent() {
1223 if (num_self_repaired_ > 0)
1224 KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
1225 << ")'s self-repair was activated " << num_self_repaired_
1226 << " time(s) out of " << num_backpropped_
1227 << " times of calling Backprop() in this training job.";
1228 }
1229 private:
1230 int32 dim_; // input/output dimension
1231 BaseFloat clipping_threshold_; // threshold to be used for clipping
1232 // could correspond to max-row-norm (if
1233 // norm_based_clipping_ == true) or
1234 // max-absolute-value (otherwise)
1235 bool norm_based_clipping_; // if true the max-row-norm will be clipped
1236 // else element-wise absolute value clipping is
1237 // done
1239 // some configuration values relating to self-repairing.
1240 BaseFloat self_repair_clipped_proportion_threshold_; // the threshold of
1241 // clipped-proportion
1242 // for self-repair to be
1243 // activated
1244 BaseFloat self_repair_target_; // the target value towards which self-repair
1245 // is trying to set for in-deriv
1246 BaseFloat self_repair_scale_; // constant scaling the self-repair vector
1247 std::string debug_info_; // component-node name, used in the destructor to
1248 // print out stats of self-repair
1250 // this function is called from Backprop code, and only does something if the
1251 // self-repair-scale config value is set and the current clipped proportion
1252 // exceeds the threshold. What it does is to add a term to in-deriv that
1253 // forces the input to the ClipGradientComponent to be close to some small
1254 // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
1255 // Sigmoid or Tanh or Affine). The hope is that if the input is forced to be
1256 // small, the parameters on the path will also tend to be small, which may
1257 // help tamp down the divergence caused by gradient explosion.
1258 void RepairGradients(const std::string &debug_info,
1259 const CuMatrixBase<BaseFloat> &in_value,
1260 CuMatrixBase<BaseFloat> *in_deriv,
1261 ClipGradientComponent *to_update) const;
1263 ClipGradientComponent &operator =
1264 (const ClipGradientComponent &other); // Disallow.
1266 protected:
1267 // variables to store stats
1268 // An element corresponds to rows of derivative matrix, when
1269 // norm_based_clipping_ is true,
1270 // else it corresponds to each element of the derivative matrix
1271 // Note: no stats are stored when norm_based_clipping_ is false
1272 int32 num_clipped_; // number of elements which were clipped
1273 int32 count_; // number of elements which were processed
1274 int32 num_self_repaired_; // number of times self-repair is activated
1275 int32 num_backpropped_; //number of times backprop is called
1277 };
1279 /** PermuteComponent changes the order of the columns (i.e. the feature or
1280 activation dimensions). Output dimension i is mapped to input dimension
1281 column_map_[i], so it's like doing:
1282 for each row:
1283 for each feature/activation dimension i:
1284 output(row, i) = input(row, column_map_[i]).
1286 */
1287 class PermuteComponent: public Component {
1288 public:
1289 PermuteComponent() {}
1290 PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }
1292 virtual int32 InputDim() const { return column_map_.Dim(); }
1293 virtual int32 OutputDim() const { return column_map_.Dim(); }
1294 virtual void InitFromConfig(ConfigLine *cfl);
1295 void Init(const std::vector<int32> &column_map);
1297 virtual std::string Type() const { return "PermuteComponent"; }
1299 virtual int32 Properties() const {
1300 return kSimpleComponent|kLinearInInput;
1301 }
1303 virtual void ZeroStats() {}
1305 virtual Component* Copy() const;
1307 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1308 const CuMatrixBase<BaseFloat> &in,
1309 CuMatrixBase<BaseFloat> *out) const;
1310 virtual void Backprop(const std::string &debug_info,
1311 const ComponentPrecomputedIndexes *indexes,
1312 const CuMatrixBase<BaseFloat> &, //in_value
1313 const CuMatrixBase<BaseFloat> &, // out_value,
1314 const CuMatrixBase<BaseFloat> &out_deriv,
1315 void *memo,
1316 Component *to_update,
1317 CuMatrixBase<BaseFloat> *in_deriv) const;
1319 virtual void Scale(BaseFloat scale) {}
1320 virtual void Add(BaseFloat alpha, const Component &other) {}
1321 virtual void Read(std::istream &is, bool binary); // This Read function
1322 // requires that the Component has the correct type.
1323 /// Write component to stream
1324 virtual void Write(std::ostream &os, bool binary) const;
1325 virtual std::string Info() const;
1326 private:
1327 // computes the reverse column map. Must not be called if column_map_.Dim()
1328 // == 0
1329 void ComputeReverseColumnMap();
1330 CuArray<int32> column_map_;
1331 // the following is a derived variable, not written to disk.
1332 // It is used in backprop.
1333 CuArray<int32> reverse_column_map_;
1334 PermuteComponent &operator =
1335 (const PermuteComponent &other); // Disallow.
1336 };
1341 // PerElementScaleComponent scales each dimension of its input with a separate
1342 // trainable scale; it's like a linear component with a diagonal matrix.
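// Schematically: out[i] = scales_[i] * in[i] for each dimension i, where
// scales_ holds the trainable parameters.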
1343 class PerElementScaleComponent: public UpdatableComponent {
1344 public:
1345 virtual int32 InputDim() const { return scales_.Dim(); }
1346 virtual int32 OutputDim() const { return scales_.Dim(); }
1348 virtual std::string Info() const;
1349 virtual void InitFromConfig(ConfigLine *cfl);
1351 PerElementScaleComponent() { } // use Init to really initialize.
1352 virtual std::string Type() const { return "PerElementScaleComponent"; }
1353 virtual int32 Properties() const {
1354 return kSimpleComponent|kUpdatableComponent|kLinearInInput|
1355 kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
1356 }
1358 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1359 const CuMatrixBase<BaseFloat> &in,
1360 CuMatrixBase<BaseFloat> *out) const;
1361 virtual void Backprop(const std::string &debug_info,
1362 const ComponentPrecomputedIndexes *indexes,
1363 const CuMatrixBase<BaseFloat> &in_value,
1364 const CuMatrixBase<BaseFloat> &, // out_value
1365 const CuMatrixBase<BaseFloat> &out_deriv,
1366 void *memo,
1367 Component *to_update,
1368 CuMatrixBase<BaseFloat> *in_deriv) const;
1370 virtual void Read(std::istream &is, bool binary);
1371 virtual void Write(std::ostream &os, bool binary) const;
1373 virtual Component* Copy() const;
1376 // Some functions from base-class UpdatableComponent.
1377 virtual void Scale(BaseFloat scale);
1378 virtual void Add(BaseFloat alpha, const Component &other);
1379 virtual void PerturbParams(BaseFloat stddev);
1380 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1381 virtual int32 NumParameters() const;
1382 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1383 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1385 // Some functions that are specific to this class.
1386 explicit PerElementScaleComponent(const PerElementScaleComponent &other);
1388 void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
1389 void Init(std::string vector_filename);
1391 protected:
1392 // This function Update() is for extensibility; child classes may override
1393 // this, e.g. for natural gradient update.
1394 virtual void Update(
1395 const std::string &debug_info,
1396 const CuMatrixBase<BaseFloat> &in_value,
1397 const CuMatrixBase<BaseFloat> &out_deriv) {
1398 UpdateSimple(in_value, out_deriv);
1399 }
1400 // UpdateSimple is used when *this is a gradient. Child classes may override
1401 // this if needed, but typically won't need to.
1402 virtual void UpdateSimple(
1403 const CuMatrixBase<BaseFloat> &in_value,
1404 const CuMatrixBase<BaseFloat> &out_deriv);
1406 const PerElementScaleComponent &operator
1407 = (const PerElementScaleComponent &other); // Disallow.
1408 CuVector<BaseFloat> scales_;
1409 };
1411 /*
1412 PerElementOffsetComponent offsets each dimension of its input with a separate
1413 trainable bias; it's like an affine component with a fixed weight matrix that
1414 is always equal to I.
1416 Accepted values on its config line, with defaults if applicable.
1418 vector If specified, the offsets will be read from this file ('vector'
1419 is interpreted as an rxfilename).
1421 dim If 'vector' is not specified, you should specify the
1422 dimension 'dim', and the offsets will be randomly initialized
1423 according to 'param-mean' and 'param-stddev'.
1424 param-mean=0.0 Mean of randomly initialized offset parameters.
1425 param-stddev=0.0 Standard deviation of randomly initialized offset parameters.
1427 */
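// For example, a hypothetical config line using the options above (the
// component name and the values are made up for illustration):
//   component name=offset1 type=PerElementOffsetComponent dim=512 \
//     param-mean=0.0 param-stddev=0.1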
1428 class PerElementOffsetComponent: public UpdatableComponent {
1429 public:
1430 virtual int32 InputDim() const { return offsets_.Dim(); }
1431 virtual int32 OutputDim() const { return offsets_.Dim(); }
1433 virtual std::string Info() const;
1434 virtual void InitFromConfig(ConfigLine *cfl);
1436 PerElementOffsetComponent() { } // use Init to really initialize.
1437 virtual std::string Type() const { return "PerElementOffsetComponent"; }
1438 virtual int32 Properties() const {
1439 return kSimpleComponent|kUpdatableComponent|
1440 kBackpropInPlace|kPropagateInPlace;
1441 }
1443 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1444 const CuMatrixBase<BaseFloat> &in,
1445 CuMatrixBase<BaseFloat> *out) const;
1446 virtual void Backprop(const std::string &debug_info,
1447 const ComponentPrecomputedIndexes *indexes,
1448 const CuMatrixBase<BaseFloat> &, // in_value
1449 const CuMatrixBase<BaseFloat> &, // out_value
1450 const CuMatrixBase<BaseFloat> &out_deriv,
1451 void *memo,
1452 Component *to_update,
1453 CuMatrixBase<BaseFloat> *in_deriv) const;
1455 virtual void Read(std::istream &is, bool binary);
1456 virtual void Write(std::ostream &os, bool binary) const;
1458 virtual Component* Copy() const;
1461 // Some functions from base-class UpdatableComponent.
1462 virtual void Scale(BaseFloat scale);
1463 virtual void Add(BaseFloat alpha, const Component &other);
1464 virtual void PerturbParams(BaseFloat stddev);
1465 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1466 virtual int32 NumParameters() const;
1467 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1468 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1470 // Some functions that are specific to this class.
1471 explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);
1473 void Init(int32 dim, BaseFloat param_mean,
1474 BaseFloat param_stddev);
1475 void Init(std::string vector_filename);
1477 protected:
1478 const PerElementOffsetComponent &operator
1479 = (const PerElementOffsetComponent &other); // Disallow.
1480 CuVector<BaseFloat> offsets_;
1481 };
1484 // ConstantFunctionComponent implements a constant function of its input,
1485 // i.e. its output does not depend on its input. It is the same as
1486 // an affine component with the linear term fixed at zero.
1487 // It is optionally trainable, and optionally you can use natural
1488 // gradient. The input is required only because it's more convenient
1489 // to make SimpleComponents [but see ConstantComponent, which requires
1490 // no inputs].
1491 class ConstantFunctionComponent: public UpdatableComponent {
1492 public:
1493 virtual int32 InputDim() const { return input_dim_; }
1494 virtual int32 OutputDim() const { return output_.Dim(); }
1496 virtual std::string Info() const;
1497 // possible parameter values with their defaults:
1498 // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
1499 // output-mean=0 output-stddev=0
1500 virtual void InitFromConfig(ConfigLine *cfl);
1502 ConstantFunctionComponent();
1504 ConstantFunctionComponent(const ConstantFunctionComponent &other);
1506 virtual std::string Type() const { return "ConstantFunctionComponent"; }
1507 virtual int32 Properties() const {
1508 return kSimpleComponent|
1509 (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
1510 (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
1511 kBackpropAdds;
1512 }
1513 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1514 const CuMatrixBase<BaseFloat> &in,
1515 CuMatrixBase<BaseFloat> *out) const;
1516 virtual void Backprop(const std::string &debug_info,
1517 const ComponentPrecomputedIndexes *indexes,
1518 const CuMatrixBase<BaseFloat> &, // in_value
1519 const CuMatrixBase<BaseFloat> &, // out_value
1520 const CuMatrixBase<BaseFloat> &out_deriv,
1521 void *memo,
1522 Component *to_update,
1523 CuMatrixBase<BaseFloat> *in_deriv) const;
1525 virtual void Read(std::istream &is, bool binary);
1526 virtual void Write(std::ostream &os, bool binary) const;
1528 virtual Component* Copy() const;
1530 // Some functions from base-class UpdatableComponent.
1531 virtual void Scale(BaseFloat scale);
1532 virtual void Add(BaseFloat alpha, const Component &other);
1533 virtual void PerturbParams(BaseFloat stddev);
1534 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1535 virtual int32 NumParameters() const;
1536 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1537 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1538 private:
1539 int32 input_dim_;
1540 // the output value-- a vector.
1541 CuVector<BaseFloat> output_;
1543 bool is_updatable_;
1544 // if true, and if updatable, do natural-gradient update.
1545 bool use_natural_gradient_;
1546 OnlineNaturalGradient preconditioner_;
1548 const ConstantFunctionComponent &operator
1549 = (const ConstantFunctionComponent &other); // Disallow.
1550 };
1554 // NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
1555 // it uses a natural gradient update for the per-element scales, and enforces a
1556 // maximum amount of change per minibatch, for stability.
1557 class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
1558 public:
1560 virtual std::string Info() const;
1562 virtual void InitFromConfig(ConfigLine *cfl);
1564 NaturalGradientPerElementScaleComponent() { } // use Init to really initialize.
1565 virtual std::string Type() const {
1566 return "NaturalGradientPerElementScaleComponent";
1567 }
1569 virtual void Read(std::istream &is, bool binary);
1570 virtual void Write(std::ostream &os, bool binary) const;
1571 virtual void FreezeNaturalGradient(bool freeze);
1573 virtual Component* Copy() const;
1575 // Some functions that are specific to this class:
1576 explicit NaturalGradientPerElementScaleComponent(
1577 const NaturalGradientPerElementScaleComponent &other);
1579 void Init(int32 dim, BaseFloat param_mean,
1580 BaseFloat param_stddev, int32 rank, int32 update_period,
1581 BaseFloat num_samples_history, BaseFloat alpha);
1582 void Init(std::string vector_filename,
1583 int32 rank, int32 update_period, BaseFloat num_samples_history,
1584 BaseFloat alpha);
1586 private:
1587 // unlike the NaturalGradientAffineComponent, there is only one dimension to
1588 // consider as the parameters are a vector not a matrix, so we only need one
1589 // preconditioner.
1590 // The preconditioner stores its own configuration values; we write and read
1591 // these, but not the preconditioner object itself.
1592 OnlineNaturalGradient preconditioner_;
1594 // Override of the parent-class Update() function, called only
1595 // if this->is_gradient_ = false; this implements the natural
1596 // gradient update.
1597 virtual void Update(
1598 const std::string &debug_info,
1599 const CuMatrixBase<BaseFloat> &in_value,
1600 const CuMatrixBase<BaseFloat> &out_deriv);
1602 const NaturalGradientPerElementScaleComponent &operator
1603 = (const NaturalGradientPerElementScaleComponent &other); // Disallow.
1604 };
1606 /**
1607 * WARNING, this component is deprecated in favor of
1608 * TimeHeightConvolutionComponent, and will be deleted.
1609 * ConvolutionalComponent implements 2d-convolution.
1610 * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1611 * 2 dimensions, as each filter has the same size as the input along the 3rd dimension.
1612 * Input : A matrix where each row is a vectorized 3D-tensor.
1613 * The 3D tensor has dimensions
1614 * x: (e.g. time)
1615 * y: (e.g. frequency)
1616 * z: (e.g. channels like features/delta/delta-delta)
1617 *
1618 * The component supports input vectorizations of type zyx and yzx.
1619 * The default vectorization type is zyx.
1620 * e.g. for input vectorization of type zyx the input is vectorized by
1621 * spanning axes z, y and x of the tensor in that order.
1622 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1623 * the zyx vectorized input looks like
1624 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1625 *
1626 *
1627 * Output : The output is also a 3D tensor vectorized in the zyx format.
1628 * The channel axis (z) in the output corresponds to the output of
1629 * different filters. The first channel corresponds to the first filter
1630 * i.e., first row of the filter_params_ matrix.
1631 *
1632 * Note: The component has to support yzx input vectorization as the binaries
1633 * like add-deltas generate yz vectorized output. These input vectors are
1634 * concatenated using the Append descriptor across time steps to form a yzx
1635 * vectorized 3D tensor input.
1636 * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1637 *
1638 *
1639 * For information on the hyperparameters and parameters of this component see
1640 * the variable declarations.
1641 *
1642 * Propagation:
1643 * ------------
1644 * The convolution operation consists of dot-products between the filter tensor
1645 * and input tensor patches, for various shifts of the filter tensor along the x
1646 * and y axes of the input tensor. (Note: there is no shift along the z-axis, as
1647 * the filter and input tensor have the same size along this axis.)
1648 *
1649 * For a particular shift (i,j) of the filter tensor
1650 * along input tensor dimensions x and y, the elements of the input tensor which
1651 * overlap with the filter form the input tensor patch. This patch is vectorized
1652 * in zyx format. All the patches corresponding to various samples in the
1653 * mini-batch are stacked into a matrix, where each row corresponds to one
1654 * patch. Let this matrix be represented by X_{i,j}. The dot products with
1655 * various filters are computed simultaneously by computing the matrix product
1656 * with the filter_params_ matrix (W)
1657 * Y_{i,j} = X_{i,j}*W^T.
1658 * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1659 *
1660 * All the matrix products corresponding to various shifts (i,j) of the
1661 * filter tensor are computed simultaneously using the AddMatMatBatched
1662 * call of CuMatrixBase class.
1663 *
1664 * BackPropagation:
1665 * ----------------
1666 * Backpropagation to compute the input derivative (\nabla X_{i,j})
1667 * consists of a series of matrix products:
1668 * \nabla X_{i,j} = \nabla Y_{i,j} * W, where \nabla Y_{i,j} corresponds to the
1669 * output derivative for a particular shift of the filter.
1670 *
1671 * Once again these matrix products are computed simultaneously.
1672 *
1673 * Update:
1674 * -------
1675 * The weight gradient is computed as
1676 * \nabla W = \sum_{i,j} (X_{i,j}^T * \nabla Y_{i,j})
1677 *
1678 */
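/*
 *  For illustration (a simplified per-filter sketch of the zyx indexing and the
 *  per-shift dot product described above; the helper names are hypothetical, and
 *  the real component instead stacks patches into matrices and uses
 *  AddMatMatBatched):
 *
 *    #include <vector>
 *
 *    // Flat index of tensor element A(x, y, z) under zyx vectorization
 *    // (z varies fastest, then y, then x).
 *    inline int ZyxIndex(int x, int y, int z, int y_dim, int z_dim) {
 *      return x * (y_dim * z_dim) + y * z_dim + z;
 *    }
 *
 *    // One output value for a single filter at shift (i, j): the dot product of
 *    // the filter with the input patch starting at (i, j).  Note: filt_z_dim
 *    // equals the input z-dim, as described above.
 *    float ConvolveAtShift(const std::vector<float> &in,    // zyx-vectorized input
 *                          const std::vector<float> &filt,  // zyx-vectorized filter
 *                          int i, int j,                    // shift along x and y
 *                          int in_y_dim, int in_z_dim,
 *                          int filt_x_dim, int filt_y_dim, int filt_z_dim) {
 *      float sum = 0.0;
 *      for (int x = 0; x < filt_x_dim; x++)
 *        for (int y = 0; y < filt_y_dim; y++)
 *          for (int z = 0; z < filt_z_dim; z++)
 *            sum += filt[ZyxIndex(x, y, z, filt_y_dim, filt_z_dim)] *
 *                   in[ZyxIndex(x + i, y + j, z, in_y_dim, in_z_dim)];
 *      return sum;
 *    }
 */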
1679 class ConvolutionComponent: public UpdatableComponent {
1680 public:
1681 enum TensorVectorizationType {
1682 kYzx = 0,
1683 kZyx = 1
1684 };
1686 ConvolutionComponent();
1687 // constructor using another component
1688 ConvolutionComponent(const ConvolutionComponent &component);
1689 // constructor using parameters
1690 ConvolutionComponent(
1691 const CuMatrixBase<BaseFloat> &filter_params,
1692 const CuVectorBase<BaseFloat> &bias_params,
1693 int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1694 int32 filt_x_dim, int32 filt_y_dim,
1695 int32 filt_x_step, int32 filt_y_step,
1696 TensorVectorizationType input_vectorization,
1697 BaseFloat learning_rate);
1699 virtual int32 InputDim() const;
1700 virtual int32 OutputDim() const;
1702 virtual std::string Info() const;
1703 virtual void InitFromConfig(ConfigLine *cfl);
1704 virtual std::string Type() const { return "ConvolutionComponent"; }
1705 virtual int32 Properties() const {
1706 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
1707 kBackpropAdds|kPropagateAdds;
1708 }
1710 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1711 const CuMatrixBase<BaseFloat> &in,
1712 CuMatrixBase<BaseFloat> *out) const;
1713 virtual void Backprop(const std::string &debug_info,
1714 const ComponentPrecomputedIndexes *indexes,
1715 const CuMatrixBase<BaseFloat> &in_value,
1716 const CuMatrixBase<BaseFloat> &, // out_value,
1717 const CuMatrixBase<BaseFloat> &out_deriv,
1718 void *memo,
1719 Component *to_update_in,
1720 CuMatrixBase<BaseFloat> *in_deriv) const;
1721 void Update(const std::string &debug_info,
1722 const CuMatrixBase<BaseFloat> &in_value,
1723 const CuMatrixBase<BaseFloat> &out_deriv,
1724 const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);
1727 virtual void Read(std::istream &is, bool binary);
1728 virtual void Write(std::ostream &os, bool binary) const;
1730 virtual Component* Copy() const;
1732 // Some functions from base-class UpdatableComponent.
1733 virtual void Scale(BaseFloat scale);
1734 virtual void Add(BaseFloat alpha, const Component &other);
1735 virtual void PerturbParams(BaseFloat stddev);
1736 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1737 virtual int32 NumParameters() const;
1738 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1739 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1741 // Some functions that are specific to this class.
1742 void SetParams(const VectorBase<BaseFloat> &bias,
1743 const MatrixBase<BaseFloat> &filter);
1744 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
1745 const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
1746 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1747 int32 filt_x_dim, int32 filt_y_dim,
1748 int32 filt_x_step, int32 filt_y_step, int32 num_filters,
1749 TensorVectorizationType input_vectorization,
1750 BaseFloat param_stddev, BaseFloat bias_stddev);
1751 // there is no filt_z_dim parameter as the length of the filter along
1752 // the z-dimension is the same as that of the input
1753 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1754 int32 filt_x_dim, int32 filt_y_dim,
1755 int32 filt_x_step, int32 filt_y_step,
1756 TensorVectorizationType input_vectorization,
1757 std::string matrix_filename);
1759 // resize the component, setting the parameters to zero, while
1760 // leaving any other configuration values the same
1761 void Resize(int32 input_dim, int32 output_dim);
1763 void Update(const std::string &debug_info,
1764 const CuMatrixBase<BaseFloat> &in_value,
1765 const CuMatrixBase<BaseFloat> &out_deriv);
1768 private:
1769 int32 input_x_dim_; // size of the input along x-axis
1770 // (e.g. number of time steps)
1772 int32 input_y_dim_; // size of input along y-axis
1773 // (e.g. number of mel-frequency bins)
1775 int32 input_z_dim_; // size of input along z-axis
1776 // (e.g. number of channels is 3 if the input has
1777 // features + delta + delta-delta features)
1779 int32 filt_x_dim_; // size of the filter along x-axis
1781 int32 filt_y_dim_; // size of the filter along y-axis
1783 // there is no filt_z_dim_ as it is always assumed to be
1784 // the same as input_z_dim_
1786 int32 filt_x_step_; // the number of steps taken along x-axis of input
1787 // before computing the next dot-product
1788 // of filter and input
1790 int32 filt_y_step_; // the number of steps taken along y-axis of input
1791 // before computing the next dot-product of the filter
1792 // and input
1794 // there is no filt_z_step_ as only dot product is possible along this axis
1796 TensorVectorizationType input_vectorization_; // type of vectorization of the
1797 // input 3D tensor. Accepts zyx and yzx formats
1799 CuMatrix<BaseFloat> filter_params_;
1800 // the filter (or kernel) matrix is a matrix of vectorized 3D filters
1801 // where each row in the matrix corresponds to one filter.
1802 // The 3D filter tensor is vectorized in zyx format.
1803 // The first row of the matrix corresponds to the first filter and so on.
1804 // Keep in mind the vectorization type and order of filters when using file
1805 // based initialization.
1807 CuVector<BaseFloat> bias_params_;
1808 // the filter-specific bias vector (i.e., there is a separate bias added
1809 // to the output of each filter).
1810 bool is_gradient_;
1812 void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
1813 CuMatrix<BaseFloat> *patches) const;
1814 void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
1815 CuMatrixBase<BaseFloat> *in_deriv) const;
1816 const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
1817 };
1820 /*
1821 LstmNonlinearityComponent is a component that implements part of an LSTM, by
1822 combining together the sigmoids and tanh's, plus some diagonal terms, into
1823 a single block.
1824 We will refer to the LSTM formulation used in
1826 "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
1827 by H. Sak et al.,
1828 http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1830 Suppose the cell dimension is C. Then outside this component, we compute
1831 the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1832 matrix multiplication:
1834 i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1835 f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1836 c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
1837 o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
1839 The part of the computation that takes place in this component is as follows.
1840 Its input is of dimension 5C [however, search for 'dropout' below],
1841 consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). Its
1842 output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1844 To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1846 This component has 3C parameters in total: the diagonal matrices w_{ic}, w_{fc}
1847 and w_{oc}.
1850 In the forward pass (Propagate), this component computes the following:
1852 i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
1853 f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
1854 c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
1855 o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
1856 m_t = o_t * Tanh(c_t) (5)
1857 # note: the outputs are just c_t and m_t.
1859 [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead
1860 of 5C; in this case, the last three input dimensions will be interpreted as
1861 per-frame dropout masks on i_t, f_t and o_t respectively, so that in equations
1862 (3) and (5), i_t is replaced by i_t * i_t_scale, and likewise for f_t and o_t.]
1864 The backprop is as you would think, but for the "self-repair" we need to pass
1865 in additional vectors (of the same dim as the parameters of the layer) that
1866 dictate whether or not we add an additional term to the backpropagated
1867 derivatives. (This term helps force the input to the nonlinearities into the
1868 range where the derivatives are not too small).
1870 This component stores stats of the same form as are normally stored by the
1871 StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1872 activations and derivatives, but this is done inside the Backprop() functions.
1873 [the StoreStats() functions don't take the input data as an argument, so
1874 storing this data that way is impossible, and anyway it's more efficient to
1875 do it as part of backprop.]
1877 Configuration values accepted:
1878 cell-dim e.g. cell-dim=1024 Cell dimension. The input
1879 dimension of this component is cell-dim * 5, and the
1880 output dimension is cell-dim * 2. Note: this
1881 component implements only part of the LSTM layer,
1882 see comments above.
1883 param-stddev Standard deviation for random initialization of
1884 the diagonal matrices (AKA peephole connections).
1885 default=1.0, which is probably too high but
1886 we couldn't see any reliable gain from decreasing it.
1887 tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold
1888 in a TanhComponent; applies to both the tanh nonlinearities.
1889 default=0.2, you probably won't want to change this.
1890 sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold
1891 in a SigmoidComponent; applies to all three of the sigmoid
1892 nonlinearities. default=0.05, you probably won't want to
1893 change this.
1894 self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent
1895 or TanhComponent; applies to both the sigmoid and tanh
1896 nonlinearities. default=1.0e-05, which you probably won't
1897 want to change unless dealing with an objective function
1898 that has smaller or larger dynamic range than normal, in
1899 which case you might want to make it smaller or larger.
1900 */
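/*
   For illustration (a per-frame, per-dimension scalar sketch of equations (1)
   through (5) above; the helper names are hypothetical, and the real component
   works on whole CuMatrix minibatches and also handles dropout masks,
   self-repair and stats):

     #include <cmath>

     struct LstmCellOutput { float c_t; float m_t; };

     inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

     // One cell dimension, one frame; w_ic, w_fc and w_oc are the diagonal
     // ("peephole") parameters for this dimension.
     LstmCellOutput LstmNonlinearity(float i_part, float f_part, float c_part,
                                     float o_part, float c_prev,
                                     float w_ic, float w_fc, float w_oc) {
       float i_t = Sigmoid(i_part + w_ic * c_prev);          // (1)
       float f_t = Sigmoid(f_part + w_fc * c_prev);          // (2)
       float c_t = f_t * c_prev + i_t * std::tanh(c_part);   // (3)
       float o_t = Sigmoid(o_part + w_oc * c_t);             // (4)
       float m_t = o_t * std::tanh(c_t);                     // (5)
       return LstmCellOutput{c_t, m_t};
     }
*/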
1901 class LstmNonlinearityComponent: public UpdatableComponent {
1902 public:
1904 virtual int32 InputDim() const;
1905 virtual int32 OutputDim() const;
1906 virtual std::string Info() const;
1907 virtual void InitFromConfig(ConfigLine *cfl);
1908 LstmNonlinearityComponent(): use_dropout_(false) { }
1909 virtual std::string Type() const { return "LstmNonlinearityComponent"; }
1910 virtual int32 Properties() const {
1911 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
1912 }
1914 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
1915 const CuMatrixBase<BaseFloat> &in,
1916 CuMatrixBase<BaseFloat> *out) const;
1917 virtual void Backprop(const std::string &debug_info,
1918 const ComponentPrecomputedIndexes *indexes,
1919 const CuMatrixBase<BaseFloat> &in_value,
1920 const CuMatrixBase<BaseFloat> &, // out_value,
1921 const CuMatrixBase<BaseFloat> &out_deriv,
1922 void *memo,
1923 Component *to_update_in,
1924 CuMatrixBase<BaseFloat> *in_deriv) const;
1926 virtual void Read(std::istream &is, bool binary);
1927 virtual void Write(std::ostream &os, bool binary) const;
1929 virtual Component* Copy() const;
1931 // Some functions from base-class UpdatableComponent.
1932 virtual void Scale(BaseFloat scale);
1933 virtual void Add(BaseFloat alpha, const Component &other);
1934 virtual void PerturbParams(BaseFloat stddev);
1935 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1936 virtual int32 NumParameters() const;
1937 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1938 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1939 virtual void ZeroStats();
1940 virtual void FreezeNaturalGradient(bool freeze);
1942 // Some functions that are specific to this class:
1943 explicit LstmNonlinearityComponent(
1944 const LstmNonlinearityComponent &other);
1946 void Init(int32 cell_dim, bool use_dropout,
1947 BaseFloat param_stddev,
1948 BaseFloat tanh_self_repair_threshold,
1949 BaseFloat sigmoid_self_repair_threshold,
1950 BaseFloat self_repair_scale);
1952 private:
1954 // Initializes the natural-gradient object with the configuration we
1955 // use for this object, which for now is hardcoded at the C++ level.
1956 void InitNaturalGradient();
1958 // Notation: C is the cell dimension; it equals params_.NumCols().
1960 // The dimension of the parameter matrix is (3 x C);
1961 // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
1962 CuMatrix<BaseFloat> params_;
1964 // If true, we expect an extra 3 dimensions on the input, for dropout masks
1965 // for i_t, f_t and o_t (see the note on dropout in the class comment above).
1966 bool use_dropout_;
1968 // Of dimension 5 x C, with a row for each of the Sigmoid/Tanh functions in
1969 // equations (1) through (5), this is the sum of the values of the nonlinearities
1970 // (used for diagnostics only). It is comparable to the value_sum_ vector
1971 // in base-class NonlinearComponent.
1972 CuMatrix<double> value_sum_;
1974 // Of dimension 5 x C, with a row for each of the Sigmoid/Tanh functions in
1975 // equations (1) through (5), this is the sum of the derivatives of the
1976 // nonlinearities (used for diagnostics and to control self-repair). It is
1977 // comparable to the deriv_sum_ vector in base-class
1978 // NonlinearComponent.
1979 CuMatrix<double> deriv_sum_;
1981 // This vector has dimension 10. The contents are a block of 5 self-repair
1982 // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
1983 // self-repair scales (typically all 0.00001). These are for each of the 5
1984 // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
1985 // more info).
1986 CuVector<BaseFloat> self_repair_config_;
1988 // This vector has dimension 5. For each of the 5 nonlinearities in the LSTM
1989 // component (see comments in cu-math.h for more info), it contains the total,
1990 // over all frames represented in count_, of the number of dimensions that
1991 // were subject to self_repair. To get the self-repair proportion you should
1992 // divide by (count_ times cell_dim_).
1993 CuVector<double> self_repair_total_;
1995 // The total count (number of frames) corresponding to the stats in value_sum_
1996 // and deriv_sum_.
1997 double count_;
1999 // Preconditioner for the parameters of this component [operates in the space
2000 // of dimension C].
2001 // The preconditioner stores its own configuration values; we write and read
2002 // these, but not the preconditioner object itself.
2003 OnlineNaturalGradient preconditioner_;
2005 const LstmNonlinearityComponent &operator
2006 = (const LstmNonlinearityComponent &other); // Disallow.
2007 };
2012 /*
2013 * WARNING, this component is deprecated as it's not compatible with
2014 * TimeHeightConvolutionComponent, and it will eventually be deleted.
2015 * MaxPoolingComponent :
2017 * The max-pooling component was first used in ConvNets for selecting a
2018 * representative activation in an area. It inspired the Maxout nonlinearity.
2018 * Each output element of this component is the maximum of a block of
2019 * input elements where the block has a 3D dimension (pool_x_size_,
2020 * pool_y_size_, pool_z_size_).
2021 * Blocks could overlap if the shift value on any axis is smaller
2022 * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
2023 * If the shift values are equal to their pool sizes, there is no
2024 * overlap, while if they all equal 1, the blocks overlap to
2025 * the greatest possible extent.
2026 *
2027 * This component is designed to be used after a ConvolutionComponent
2028 * so that the input matrix is propagated from a 2d-convolutional layer.
2029 * This component implements 3d-maxpooling which performs
2030 * max pooling along the three axes.
2031 * Input : A matrix where each row is a vectorized 3D-tensor.
2032 * The 3D tensor has dimensions
2033 * x: (e.g. time)
2034 * y: (e.g. frequency)
2035 * z: (e.g. channels like number of filters in the ConvolutionComponent)
2036 *
2037 * The component assumes input vectorizations of type zyx
2038 * which is the default output vectorization type of a ConvolutionComponent.
2039 * e.g. for input vectorization of type zyx the input is vectorized by
2040 * spanning axes z, y and x of the tensor in that order.
2041 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
2042 * the zyx vectorized input looks like
2043 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
2044 *
2045 * Output : The output is also a 3D tensor vectorized in the zyx format.
2046 *
2047 * For information on the hyperparameters and parameters of this component see
2048 * the variable declarations.
2049 *
2050 *
2051 */
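/*
 *  For illustration (a sketch of the output-dimension arithmetic implied by the
 *  description above, assuming (input_dim - pool_size) divides exactly by
 *  pool_step; the helper names are hypothetical):
 *
 *    // Number of pooling windows along one axis.
 *    inline int NumPools(int input_dim, int pool_size, int pool_step) {
 *      return 1 + (input_dim - pool_size) / pool_step;
 *    }
 *
 *    // Output dimension = product of the window counts along x, y and z.
 *    inline int MaxpoolOutputDim(int in_x, int in_y, int in_z,
 *                                int pool_x_size, int pool_y_size, int pool_z_size,
 *                                int pool_x_step, int pool_y_step, int pool_z_step) {
 *      return NumPools(in_x, pool_x_size, pool_x_step) *
 *             NumPools(in_y, pool_y_size, pool_y_step) *
 *             NumPools(in_z, pool_z_size, pool_z_step);
 *    }
 */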
2052 class MaxpoolingComponent: public Component {
2053 public:
2055 MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
2056 pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
2057 pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
2058 // constructor using another component
2059 MaxpoolingComponent(const MaxpoolingComponent &component);
2061 virtual int32 InputDim() const;
2062 virtual int32 OutputDim() const;
2064 virtual std::string Info() const;
2065 virtual void InitFromConfig(ConfigLine *cfl);
2066 virtual std::string Type() const { return "MaxpoolingComponent"; }
2067 virtual int32 Properties() const {
2068 return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
2069 kBackpropAdds;
2070 }
2072 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
2073 const CuMatrixBase<BaseFloat> &in,
2074 CuMatrixBase<BaseFloat> *out) const;
2075 virtual void Backprop(const std::string &debug_info,
2076 const ComponentPrecomputedIndexes *indexes,
2077 const CuMatrixBase<BaseFloat> &in_value,
2078 const CuMatrixBase<BaseFloat> &out_value,
2079 const CuMatrixBase<BaseFloat> &out_deriv,
2080 void *memo,
2081 Component *, // to_update,
2082 CuMatrixBase<BaseFloat> *in_deriv) const;
2084 virtual void Read(std::istream &is, bool binary); // This Read function
2085 // requires that the Component has the correct type.
2087 /// Write component to stream
2088 virtual void Write(std::ostream &os, bool binary) const;
2089 virtual Component* Copy() const { return new MaxpoolingComponent(*this); }
2092 protected:
2093 void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
2094 CuMatrix<BaseFloat> *patches) const;
2095 void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
2096 CuMatrixBase<BaseFloat> *in_deriv) const;
2097 virtual void Check() const;
2100 int32 input_x_dim_; // size of the input along x-axis
2101 // (e.g. number of time steps)
2102 int32 input_y_dim_; // size of input along y-axis
2103 // (e.g. number of mel-frequency bins)
2104 int32 input_z_dim_; // size of input along z-axis
2105 // (e.g. number of filters in the ConvolutionComponent)
2107 int32 pool_x_size_; // size of the pooling window along x-axis
2108 int32 pool_y_size_; // size of the pooling window along y-axis
2109 int32 pool_z_size_; // size of the pooling window along z-axis
2111 int32 pool_x_step_; // the number of steps taken along x-axis of input
2112 // before computing the next pool
2113 int32 pool_y_step_; // the number of steps taken along y-axis of input
2114 // before computing the next pool
2115 int32 pool_z_step_; // the number of steps taken along z-axis of input
2116 // before computing the next pool
2118 };
2121 /*
2122 BatchNormComponent
2124 This implements batch normalization; for each dimension of the
2125 input it normalizes the data to be zero-mean, unit-variance. You
2126 can set the block-dim configuration value to implement spatial
2127 batch normalization; see the comment for the block_dim_ variable below.
2129 It's a simple component (uses the kSimpleComponent flag), but it is unusual in
2130 that it will give different results if you call it on half the matrix at a
2131 time. Most of the time this would be pretty harmless, so we still return the
2132 kSimpleComponent flag. We may have to modify the test code a little to
2133 account for this, or possibly remove the kSimpleComponent flag. In some sense
2134 each output Index depends on every input Index, but putting those dependencies
2135 explicitly into the dependency-tracking framework as a GeneralComponent
2136 would be very impractical and might lead to a lot of unnecessary things being
2137 computed. You have to be a bit careful where you put this component, and understand
2138 what you're doing, e.g. putting it in the path of a recurrence is a bit problematic
2139 if the minibatch size is small.
2140 */
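/*
   For illustration (a single-dimension CPU sketch of the normalization
   described above; the function name is hypothetical, and the real component
   works on CuMatrix blocks, keeps running stats, and has a separate test-mode
   path):

     #include <algorithm>
     #include <cmath>
     #include <vector>

     // Normalize one input dimension over a minibatch to zero mean and
     // stddev equal to target_rms, flooring the variance by epsilon.
     void BatchNormOneDim(std::vector<float> *column,
                          float epsilon, float target_rms) {
       double sum = 0.0, sumsq = 0.0;
       for (float x : *column) { sum += x; sumsq += x * x; }
       double n = static_cast<double>(column->size());
       double mean = sum / n;
       double uvar = sumsq / n;                      // uncentered variance
       double var = std::max(uvar - mean * mean,
                             static_cast<double>(epsilon));  // epsilon as a floor
       double scale = target_rms / std::sqrt(var);
       for (float &x : *column) x = static_cast<float>((x - mean) * scale);
     }
*/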
2141 class BatchNormComponent: public Component {
2142 public:
2144 BatchNormComponent(): dim_(0), block_dim_(0),
2145 epsilon_(1.0e-03), target_rms_(1.0),
2146 test_mode_(false), count_(0) { }
2148 // call this with 'true' to set 'test mode' where the batch normalization is
2149 // done with stored stats. There won't normally be any need to specially
2150 // accumulate these stats; they are stored as a matter of course on each
2151 // iteration of training, as for NonlinearComponents, and we'll use the stats
2152 // from the most recent [script-level] iteration.
2153 void SetTestMode(bool test_mode);
2155 // constructor using another component
2156 BatchNormComponent(const BatchNormComponent &other);
2158 virtual int32 InputDim() const { return dim_; }
2159 virtual int32 OutputDim() const { return dim_; }
2161 virtual std::string Info() const;
2162 // supports the config variables dim, block-dim (which defaults to dim),
2163 // epsilon (which defaults to 1.0e-3), and target-rms (which defaults to 1.0,
2164 // and is a scaling on the output; it's comparable to the target-rms of
2165 // NormalizeComponent). It also accepts a boolean 'test-mode' config which is
2166 // only intended for use in testing code, and not in real situations. (Note:
2167 // test-mode is a real thing that's used during 'inference' given a previously
2168 // computed model, and we do set test mode in real situations; we just don't
2169 // do so from the config, we use the function SetTestMode().)
2170 virtual void InitFromConfig(ConfigLine *cfl);
2171 virtual std::string Type() const { return "BatchNormComponent"; }
2172 virtual int32 Properties() const {
2173 // If the block-dim is less than the dim, we need the input and output
2174 // matrices to be contiguous (stride==num-cols), as we'll be reshaping
2175 // internally. This is not much of a cost, because this will be used
2176 // in convnets where we have to do this anyway.
2177 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|
2178 kBackpropInPlace|
2179 (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)|
2180 (test_mode_ ? 0 : kUsesMemo|kStoresStats);
2181 }
2182 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
2183 const CuMatrixBase<BaseFloat> &in,
2184 CuMatrixBase<BaseFloat> *out) const;
2185 virtual void Backprop(const std::string &debug_info,
2186 const ComponentPrecomputedIndexes *indexes,
2187 const CuMatrixBase<BaseFloat> &in_value,
2188 const CuMatrixBase<BaseFloat> &out_value,
2189 const CuMatrixBase<BaseFloat> &out_deriv,
2190 void *memo,
2191 Component *, // to_update,
2192 CuMatrixBase<BaseFloat> *in_deriv) const;
2194 virtual void Read(std::istream &is, bool binary); // This Read function
2195 // requires that the Component has the correct type.
2197 /// Write component to stream
2198 virtual void Write(std::ostream &os, bool binary) const;
2199 virtual Component* Copy() const { return new BatchNormComponent(*this); }
2201 virtual void Scale(BaseFloat scale);
2202 virtual void Add(BaseFloat alpha, const Component &other);
2203 virtual void ZeroStats();
2206 virtual void DeleteMemo(void *memo) const { delete static_cast<Memo*>(memo); }
2208 virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
2209 const CuMatrixBase<BaseFloat> &out_value,
2210 void *memo);
2212 // Members specific to this component type.
2213 // Note: the offset and scale will only be nonempty in 'test mode'.
2214 const CuVector<BaseFloat> &Offset() const { return offset_; }
2215 const CuVector<BaseFloat> &Scale() const { return scale_; }
2217 private:
2219 struct Memo {
2220 // number of frames (after any reshaping).
2221 int32 num_frames;
2222 // 'mean_uvar_scale' is of dimension 4 by block_dim_:
2223 // Row 0 = mean = the mean of the rows of the input
2224 // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames).
2225 // Row 2 = scale = the scale of the renormalization, which is the inverse
2226 // stddev of the input (modified by epsilon_; see the Propagate
2227 // function).
2228 // Row 3 is used as a temporary in Backprop.
2229 CuMatrix<BaseFloat> mean_uvar_scale;
2230 };
2232 void Check() const;
2234 // this function is used in a couple of places; it turns the raw stats into
2235 // the offset/scale term of a normalizing transform.
2236 static void ComputeOffsetAndScale(double count,
2237 BaseFloat epsilon,
2238 const Vector<double> &stats_sum,
2239 const Vector<double> &stats_sumsq,
2240 Vector<BaseFloat> *offset,
2241 Vector<BaseFloat> *scale);
2242 // computes derived parameters offset_ and scale_.
2243 void ComputeDerived();
2245 // Dimension of the input and output.
2246 int32 dim_;
2247 // This would normally be the same as dim_, but if it's less (and it must be >
2248 // 0 and must divide dim_), then each separate block of the input of dimension
2249 // 'block_dim_' is treated like a separate frame for the purposes of
2250 // normalization. This can be used to implement spatial batch normalization
2251 // for convolutional setups-- assuming the filter-dim has stride 1, which it
2252 // always will in the new code in nnet-convolutional-component.h, when it's
2253 // finished.
2254 int32 block_dim_;
2256 // Used to avoid exact-zero variances, epsilon has the dimension of a
2257 // covariance; in this work it is applied as a floor, not as an additive term
2258 // (this is safer in the presence of numerical roundoff).
2259 BaseFloat epsilon_;
2261 // This value will normally be 1.0, which is the default, but you can set it
2262 // to other values as a way to control how fast the following layer learns
2263 // (smaller -> slower). The same config exists in NormalizeComponent.
2264 BaseFloat target_rms_;
2266 // This is true if we want the batch normalization to operate in 'test mode'
2267 meaning the data mean and stddev used for the normalization are fixed
2268 // quantities based on previously accumulated stats. Note: the stats we use
2269 // for this are based on the same 'StoreStats' mechanism as we use for
2270 // components like SigmoidComponent and ReluComponent; we'll be using
2271 // the stats from the most recent [script-level] iteration of training.
2272 bool test_mode_;
2275 // total count of stats stored by StoreStats().
2276 double count_;
2277 // sum-of-data component of stats of input data.
2278 CuVector<double> stats_sum_;
2279 // sum-of-squared component of stats of input data.
2280 CuVector<double> stats_sumsq_;
2282 // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they
2283 // dictate the transform that is done in 'test mode'. They are set only when
2284 // reading the model from disk and when calling SetTestMode(true); they are
2285 // resized to empty when the stats are updated, to ensure that out-of-date
2286 // values are not kept around.
2287 CuVector<BaseFloat> offset_;
2288 CuVector<BaseFloat> scale_;
2289 };
2293 /**
2294 CompositeComponent is a component representing a sequence of
2295 [simple] components. The config line would be something like the following
2296 (imagine this is all on one line):
2298 component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
2299 component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
2300 component2='type=RectifiedLinearComponent dim=10000' \
2301 component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
2303 The reason you might want to use this component, instead of directly using
2304 the same sequence of components in the config file, is to save GPU memory (at
2305 the expense of more compute) -- because doing it like this means we have to
2306 re-do parts of the forward pass in the backprop phase, but we avoid keeping
2307 large intermediate matrices in memory for long (and you can make the memory usage
2308 very small by making max-rows-process small). We inherit from UpdatableComponent just in
2309 case one or more of the components in the sequence are updatable.
2311 It is an error to nest a CompositeComponent inside a CompositeComponent.
2312 The same effect can be accomplished by specifying a smaller max-rows-process
2313 in a single CompositeComponent.
2314 */
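/*
   For illustration (a hypothetical programmatic setup; 'affine1', 'relu' and
   'affine2' stand for sub-components created and configured elsewhere):

     std::vector<Component*> parts;
     parts.push_back(affine1);   // e.g. a BlockAffineComponent
     parts.push_back(relu);      // e.g. a RectifiedLinearComponent
     parts.push_back(affine2);   // e.g. another BlockAffineComponent
     CompositeComponent composite;
     // Init() takes ownership of the pointers; 2048 is the max-rows-process
     // value, i.e. the maximum number of rows processed at a time.
     composite.Init(parts, 2048);
*/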
2315 class CompositeComponent: public UpdatableComponent {
2316 public:
2317 virtual int32 InputDim() const;
2318 virtual int32 OutputDim() const;
2320 virtual std::string Info() const;
2322 virtual void InitFromConfig(ConfigLine *cfl);
2324 virtual Component* Copy() const;
2326 CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.
2328 // Initialize from this list of components; takes ownership of the pointers.
2329 void Init(const std::vector<Component*> &components,
2330 int32 max_rows_process);
2332 virtual std::string Type() const { return "CompositeComponent"; }
2334 // The properties depend on the properties of the constituent components. As
2335 // a special case, we never return kStoresStats in the properties: by default
2336 // we store things like activation stats (e.g. for nonlinear components like
2337 // ReLU) as part of the backprop. This means we may wastefully store stats
2338 // even when not requested, but it does save time as a separate StoreStats()
2339 // call would involve propagating the internals.
2340 virtual int32 Properties() const;
2342 virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
2343 const CuMatrixBase<BaseFloat> &in,
2344 CuMatrixBase<BaseFloat> *out) const;
2345 virtual void Backprop(const std::string &debug_info,
2346 const ComponentPrecomputedIndexes *indexes,
2347 const CuMatrixBase<BaseFloat> &in_value,
2348 const CuMatrixBase<BaseFloat> &, // out_value
2349 const CuMatrixBase<BaseFloat> &out_deriv,
2350 void *memo,
2351 Component *to_update,
2352 CuMatrixBase<BaseFloat> *in_deriv) const;
2354 // note, we don't implement StoreStats() as it would be inefficient. Instead,
2355 // by default we call StoreStats() on all members that have the flag set,
2356 // inside the Backprop.
2357 virtual void ZeroStats();
2359 virtual void Read(std::istream &is, bool binary);
2360 virtual void Write(std::ostream &os, bool binary) const;
2362 // Don't implement Copy() at this level: implement it in the child class.
2364 // Some functions from base-class UpdatableComponent.
2365 virtual void SetUnderlyingLearningRate(BaseFloat lrate);
2366 virtual void SetActualLearningRate(BaseFloat lrate);
2367 virtual void SetAsGradient();
2368 virtual void Scale(BaseFloat scale);
2369 virtual void Add(BaseFloat alpha, const Component &other);
2370 virtual void PerturbParams(BaseFloat stddev);
2371 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
2372 virtual int32 NumParameters() const;
2373 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
2374 virtual void UnVectorize(const VectorBase<BaseFloat> &params);
2375 virtual void FreezeNaturalGradient(bool freeze);
2377 // note: we don't implement the StoreStats function as it would be quite
2378 // expensive; instead, by default we call StoreStats() for any components that
2379 // want to store stats, as part of the backprop pass. This is not 100% ideal
2380 // but it will usually do what you want. We can revisit this later if needed.
2382 // Functions to iterate over the internal components
2384 int32 NumComponents() const { return components_.size();}
2385 /// Gets the ith component in this component.
2386 /// The ordering is the same as in the config line. The caller
2387 /// does not own the received component.
2388 const Component* GetComponent(int32 i) const;
2389 /// Sets the ith component. After this call, CompositeComponent owns
2390 /// the reference to the argument component. Frees the previous
2391 /// ith component.
2392 void SetComponent(int32 i, Component *component);
2394 virtual ~CompositeComponent() { DeletePointers(&components_); }
2395 private:
2396 // returns the stride type, kDefaultStride or kStrideEqualNumCols,
2397 // at the output of the i'th component.
2398 inline MatrixStrideType GetStrideType(int32 i) const;
2400 // returns true if at least one of 'components_' returns the kUpdatable flag
2401 // in its flags.
2402 bool IsUpdatable() const;
2404 // the maximum number of rows to process at a time in Propagate() and Backprop()
// (the max-rows-process config value; see the class comment above).
2405 int32 max_rows_process_;
2406 std::vector<Component*> components_;
2408 };
2411 } // namespace nnet3
2412 } // namespace kaldi
2415 #endif