1 // nnet3/nnet-simple-component.h
3 // Copyright 2011-2013 Karel Vesely
4 // 2012-2015 Johns Hopkins University (author: Daniel Povey)
5 // 2013 Xiaohui Zhang
6 // 2014-2015 Vijayaditya Peddinti
7 // 2014-2015 Guoguo Chen
8 // 2015 Daniel Galvez
9 // 2015 Tom Ko
11 // See ../../COPYING for clarification regarding multiple authors
12 //
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 //
17 // http://www.apache.org/licenses/LICENSE-2.0
18 //
19 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
20 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
21 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
23 // See the Apache 2 License for the specific language governing permissions and
24 // limitations under the License.
26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
29 #include "nnet3/nnet-common.h"
30 #include "nnet3/nnet-component-itf.h"
31 #include "nnet3/natural-gradient-online.h"
32 #include <iostream>
34 namespace kaldi {
35 namespace nnet3 {
37 /// @file nnet-simple-component.h
38 /// This file contains declarations of components that are "simple", meaning
39 /// they don't care about the indexes they are operating on, produce one
40 /// output for one input, and return the kSimpleComponent flag in their
41 /// Properties(): for example, tanh and affine components. In
42 /// nnet-general-component.h there are components that don't fit this pattern.
// This "nnet3" version of the p-norm component only supports the 2-norm.
// The input is divided into consecutive groups (presumably of size
// input-dim / output-dim -- TODO confirm against the .cc file), and each
// output is the 2-norm of its group.
class PnormComponent: public Component {
 public:
  // Sets the input and output dimensions of the component.
  void Init(int32 input_dim, int32 output_dim);
  explicit PnormComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
  }
  PnormComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "PnormComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new PnormComponent(input_dim_,
                                                              output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
// This component randomly zeros dropout_proportion of the input
// and the derivatives are backpropagated through the nonzero inputs.
// Typically this component is used during training but not at test time.
// The idea is described under the name Dropout, in the paper
// "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
class DropoutComponent : public RandomComponent {
 public:
  // Sets the dimension, the proportion of inputs to zero, and whether
  // dropout is applied per frame (whole rows) rather than per element.
  void Init(int32 dim, BaseFloat dropout_proportion = 0.0,
            bool dropout_per_frame = false);

  DropoutComponent(int32 dim, BaseFloat dropout = 0.0,
                   bool dropout_per_frame = false) {
    Init(dim, dropout, dropout_per_frame);
  }

  DropoutComponent(): dim_(0), dropout_proportion_(0.0),
                      dropout_per_frame_(false) { }

  virtual int32 Properties() const {
    return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
  }
  virtual std::string Type() const { return "DropoutComponent"; }

  virtual void InitFromConfig(ConfigLine *cfl);

  // Input and output dimensions are the same: dropout does not reshape.
  virtual int32 InputDim() const { return dim_; }

  virtual int32 OutputDim() const { return dim_; }

  virtual void Read(std::istream &is, bool binary);

  // Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new DropoutComponent(dim_,
                                                                dropout_proportion_,
                                                                dropout_per_frame_); }
  virtual std::string Info() const;

  // Allows the dropout proportion to be changed at runtime, e.g. on a
  // schedule during training.
  void SetDropoutProportion(BaseFloat dropout_proportion) {
    dropout_proportion_ = dropout_proportion;
  }

 private:
  int32 dim_;
  /// dropout-proportion is the proportion that is dropped out,
  /// e.g. if 0.1, we set 10% to zero value.
  BaseFloat dropout_proportion_;
  bool dropout_per_frame_;
};
// Component that forms its output as an elementwise product over
// parts of its input (presumably input-dim is a multiple of output-dim
// and the parts are multiplied together -- TODO confirm against the .cc file).
class ElementwiseProductComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim);
  explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput;
  }
  ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "ElementwiseProductComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
                                                                           output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
// Component that normalizes each row of its input so that the output has a
// target root-mean-square value (target-rms); optionally it appends the
// log of the standard deviation as one extra output dimension.
class NormalizeComponent: public Component {
 public:
  void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
  explicit NormalizeComponent(int32 input_dim,
                              BaseFloat target_rms = 1.0,
                              bool add_log_stddev = false) {
    Init(input_dim, target_rms, add_log_stddev);
  }
  explicit NormalizeComponent(const NormalizeComponent &other);
  // note: there is some special code in NonlinearComponent::Info() that
  // specifically caters to this class.
  virtual int32 Properties() const {
    // When add_log_stddev_ is true the output dim differs from the input dim,
    // so in-place propagation/backprop is not possible.
    return (add_log_stddev_ ?
            kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
            kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
            kBackpropAdds|kBackpropInPlace);
  }
  NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
  virtual std::string Type() const { return "NormalizeComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual Component* Copy() const { return new NormalizeComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const {
    // +1 for the appended log-stddev dimension, if enabled.
    return (input_dim_ + (add_log_stddev_ ? 1 : 0));
  }
  virtual std::string Info() const;
 private:
  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
  enum { kExpSquaredNormFloor = -66 };
  // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly
  // representable in float and whose inverse square root is also exactly
  // representable in float (hence, an even power of two).
  static const BaseFloat kSquaredNormFloor;
  int32 input_dim_;
  BaseFloat target_rms_; // The target rms for outputs.
  bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
  // is an extra dimension of the output.
};
// Elementwise sigmoid nonlinearity.
class SigmoidComponent: public NonlinearComponent {
 public:
  explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
  SigmoidComponent() { }
  virtual std::string Type() const { return "SigmoidComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
  }
  virtual Component* Copy() const { return new SigmoidComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       SigmoidComponent *to_update) const;

  SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
};
// Elementwise tanh nonlinearity.
class TanhComponent: public NonlinearComponent {
 public:
  explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
  TanhComponent() { }
  virtual std::string Type() const { return "TanhComponent"; }
  virtual Component* Copy() const { return new TanhComponent(*this); }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       TanhComponent *to_update) const;

  TanhComponent &operator = (const TanhComponent &other); // Disallow.
};
// Elementwise rectified-linear (ReLU) nonlinearity.
class RectifiedLinearComponent: public NonlinearComponent {
 public:
  explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
      NonlinearComponent(other) { }
  RectifiedLinearComponent() { }
  virtual std::string Type() const { return "RectifiedLinearComponent"; }
  virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
        kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
                       RectifiedLinearComponent *to_update) const;

  RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
};
/**
   This component is a fixed (non-trainable) nonlinearity that sums its inputs
   to produce outputs. Currently the only supported configuration is that its
   input-dim is interpreted as consisting of n blocks, and the output is just a
   summation over the n blocks, where n = input-dim / output-dim, so for instance
    output[n] = input[n] + input[block-size + n] + .... .
   Later if needed we can add a configuration variable that allows you to sum
   over 'interleaved' input.
 */
class SumReduceComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim);
  explicit SumReduceComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput;
  }
  SumReduceComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "SumReduceComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new SumReduceComponent(input_dim_,
                                                                  output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
377 class FixedAffineComponent;
378 class FixedScaleComponent;
379 class PerElementScaleComponent;
380 class PerElementOffsetComponent;
382 // Affine means a linear function plus an offset.
383 // Note: although this class can be instantiated, it also
384 // functions as a base-class for more specialized versions of
385 // AffineComponent.
386 class AffineComponent: public UpdatableComponent {
387 friend class SoftmaxComponent; // Friend declaration relates to mixing up.
388 public:
390 virtual int32 InputDim() const { return linear_params_.NumCols(); }
391 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
393 virtual std::string Info() const;
394 virtual void InitFromConfig(ConfigLine *cfl);
396 AffineComponent() { } // use Init to really initialize.
397 virtual std::string Type() const { return "AffineComponent"; }
398 virtual int32 Properties() const {
399 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
400 kBackpropNeedsInput|kBackpropAdds;
401 }
404 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
405 const CuMatrixBase<BaseFloat> &in,
406 CuMatrixBase<BaseFloat> *out) const;
407 virtual void Backprop(const std::string &debug_info,
408 const ComponentPrecomputedIndexes *indexes,
409 const CuMatrixBase<BaseFloat> &in_value,
410 const CuMatrixBase<BaseFloat> &, // out_value
411 const CuMatrixBase<BaseFloat> &out_deriv,
412 Component *to_update,
413 CuMatrixBase<BaseFloat> *in_deriv) const;
415 virtual void Read(std::istream &is, bool binary);
416 virtual void Write(std::ostream &os, bool binary) const;
418 virtual Component* Copy() const;
421 // Some functions from base-class UpdatableComponent.
422 virtual void Scale(BaseFloat scale);
423 virtual void Add(BaseFloat alpha, const Component &other);
424 virtual void PerturbParams(BaseFloat stddev);
425 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
426 virtual int32 NumParameters() const;
427 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
428 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
430 // Some functions that are specific to this class.
432 // This new function is used when mixing up:
433 virtual void SetParams(const VectorBase<BaseFloat> &bias,
434 const MatrixBase<BaseFloat> &linear);
435 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
436 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
437 explicit AffineComponent(const AffineComponent &other);
438 // The next constructor is used in converting from nnet1.
439 AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
440 const CuVectorBase<BaseFloat> &bias_params,
441 BaseFloat learning_rate);
442 void Init(int32 input_dim, int32 output_dim,
443 BaseFloat param_stddev, BaseFloat bias_stddev);
444 void Init(std::string matrix_filename);
446 // This function resizes the dimensions of the component, setting the
447 // parameters to zero, while leaving any other configuration values the same.
448 virtual void Resize(int32 input_dim, int32 output_dim);
450 // The following functions are used for collapsing multiple layers
451 // together. They return a pointer to a new Component equivalent to
452 // the sequence of two components. We haven't implemented this for
453 // FixedLinearComponent yet.
454 Component *CollapseWithNext(const AffineComponent &next) const ;
455 Component *CollapseWithNext(const FixedAffineComponent &next) const;
456 Component *CollapseWithNext(const FixedScaleComponent &next) const;
457 Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
459 protected:
460 friend class NaturalGradientAffineComponent;
461 // This function Update() is for extensibility; child classes may override
462 // this, e.g. for natural gradient update.
463 virtual void Update(
464 const std::string &debug_info,
465 const CuMatrixBase<BaseFloat> &in_value,
466 const CuMatrixBase<BaseFloat> &out_deriv) {
467 UpdateSimple(in_value, out_deriv);
468 }
469 // UpdateSimple is used when *this is a gradient. Child classes may override
470 // this if needed, but typically won't need to.
471 virtual void UpdateSimple(
472 const CuMatrixBase<BaseFloat> &in_value,
473 const CuMatrixBase<BaseFloat> &out_deriv);
475 const AffineComponent &operator = (const AffineComponent &other); // Disallow.
476 CuMatrix<BaseFloat> linear_params_;
477 CuVector<BaseFloat> bias_params_;
478 };
480 class RepeatedAffineComponent;
482 /// This class implements an affine transform using a block diagonal matrix
483 /// e.g., one whose weight matrix is all zeros except for blocks on the
484 /// diagonal. All these blocks have the same dimensions.
485 /// input-dim: num cols of block diagonal matrix.
486 /// output-dim: num rows of block diagonal matrix.
487 /// num-blocks: number of blocks in diagonal of the matrix.
488 /// num-blocks must divide both input-dim and output-dim
489 class BlockAffineComponent : public UpdatableComponent {
490 public:
491 virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
492 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
494 virtual std::string Info() const;
495 virtual void InitFromConfig(ConfigLine *cfl);
497 BlockAffineComponent() { }
498 virtual std::string Type() const { return "BlockAffineComponent"; }
499 virtual int32 Properties() const {
500 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
501 kBackpropNeedsInput|kBackpropAdds;
502 }
504 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
505 const CuMatrixBase<BaseFloat> &in,
506 CuMatrixBase<BaseFloat> *out) const;
508 virtual void Backprop(const std::string &debug_info,
509 const ComponentPrecomputedIndexes *indexes,
510 const CuMatrixBase<BaseFloat> &in_value,
511 const CuMatrixBase<BaseFloat> &, // out_value
512 const CuMatrixBase<BaseFloat> &out_deriv,
513 Component *to_update,
514 CuMatrixBase<BaseFloat> *in_deriv) const;
516 virtual void Read(std::istream &is, bool binary);
517 virtual void Write(std::ostream &os, bool binary) const;
519 virtual Component* Copy() const;
521 // Functions from base-class UpdatableComponent.
522 virtual void Scale(BaseFloat scale);
523 virtual void Add(BaseFloat alpha, const Component &other);
524 virtual void PerturbParams(BaseFloat stddev);
525 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
526 virtual int32 NumParameters() const;
527 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
528 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
530 // BlockAffine-specific functions.
531 void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
532 BaseFloat param_stddev, BaseFloat bias_mean,
533 BaseFloat bias_stddev);
534 explicit BlockAffineComponent(const BlockAffineComponent &other);
535 explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
536 protected:
537 // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
538 // equal size. The blocks are stored in linear_params_ as
539 // [ M
540 // N
541 // O ] but we actually treat it as the matrix:
542 // [ M 0 0
543 // 0 N 0
544 // 0 0 O ]
545 CuMatrix<BaseFloat> linear_params_;
546 CuVector<BaseFloat> bias_params_;
547 int32 num_blocks_;
548 private:
549 const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
550 };
552 class RepeatedAffineComponent: public UpdatableComponent {
553 public:
555 virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
556 virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
558 virtual std::string Info() const;
559 virtual void InitFromConfig(ConfigLine *cfl);
561 RepeatedAffineComponent() { } // use Init to really initialize.
562 virtual std::string Type() const { return "RepeatedAffineComponent"; }
563 virtual int32 Properties() const {
564 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
565 kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
566 }
567 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
568 const CuMatrixBase<BaseFloat> &in,
569 CuMatrixBase<BaseFloat> *out) const;
570 virtual void Backprop(const std::string &debug_info,
571 const ComponentPrecomputedIndexes *indexes,
572 const CuMatrixBase<BaseFloat> &in_value,
573 const CuMatrixBase<BaseFloat> &, // out_value
574 const CuMatrixBase<BaseFloat> &out_deriv,
575 Component *to_update,
576 CuMatrixBase<BaseFloat> *in_deriv) const;
578 virtual void Read(std::istream &is, bool binary);
579 virtual void Write(std::ostream &os, bool binary) const;
581 virtual Component* Copy() const;
583 // Some functions from base-class UpdatableComponent.
584 virtual void Scale(BaseFloat scale);
585 virtual void Add(BaseFloat alpha, const Component &other);
586 virtual void PerturbParams(BaseFloat stddev);
587 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
588 virtual int32 NumParameters() const;
589 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
590 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
592 // Some functions that are specific to this class.
593 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
594 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
595 explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
597 void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
598 BaseFloat param_stddev, BaseFloat bias_mean,
599 BaseFloat bias_stddev);
600 friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
601 protected:
602 // This function Update(), called from backprop, is broken out for
603 // extensibility to natural gradient update.
604 virtual void Update(
605 const CuMatrixBase<BaseFloat> &in_value,
606 const CuMatrixBase<BaseFloat> &out_deriv);
608 // This function does nothing here but is redefined in child-class
609 // NaturalGradientRepeatedAffineComponent. This help avoid repeated code.
610 virtual void SetNaturalGradientConfigs() { }
612 const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
613 CuMatrix<BaseFloat> linear_params_;
614 CuVector<BaseFloat> bias_params_;
615 int32 num_repeats_;
616 };
// Version of RepeatedAffineComponent whose Update() applies a natural
// gradient (preconditioned) update on the input side.
class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
 public:
  // Use Init() to really initialize.
  NaturalGradientRepeatedAffineComponent() { }

  // Most of the public functions are inherited from RepeatedAffineComponent.
  virtual std::string Type() const {
    return "NaturalGradientRepeatedAffineComponent";
  }

  virtual Component* Copy() const;

  // Copy constructor
  explicit NaturalGradientRepeatedAffineComponent(
      const NaturalGradientRepeatedAffineComponent &other);
 private:
  virtual void Update(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientRepeatedAffineComponent &operator=(
      const NaturalGradientRepeatedAffineComponent &other); // Disallow.

  // Applies the default configuration to preconditioner_in_.
  virtual void SetNaturalGradientConfigs();

  // For efficiency reasons we only apply the natural gradient to the input
  // side, i.e. not to the space of output derivatives-- we believe the input
  // side is the more important side.  We don't make the natural-gradient
  // configurable; we just give it a reasonable configuration.
  // Instead of using the individual data-points, for efficiency reasons we use
  // the distribution of per-minibatch summed derivatives over each dimension of
  // the output space, as the source for the Fisher matrix.
  OnlineNaturalGradient preconditioner_in_;
};
// Softmax nonlinearity (per row of the input matrix).
class SoftmaxComponent: public NonlinearComponent {
 public:
  explicit SoftmaxComponent(const SoftmaxComponent &other):
      NonlinearComponent(other) { }
  SoftmaxComponent() { }
  virtual std::string Type() const { return "SoftmaxComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 private:
  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};
// Log-softmax nonlinearity (per row of the input matrix).
class LogSoftmaxComponent: public NonlinearComponent {
 public:
  explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
      NonlinearComponent(other) { }
  LogSoftmaxComponent() { }
  virtual std::string Type() const { return "LogSoftmaxComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
 private:
  LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
};
/// Keywords: natural gradient descent, NG-SGD, naturalgradient.  For
/// the top-level of the natural gradient code look here, and also in
/// nnet-precondition-online.h.
/// NaturalGradientAffineComponent is
/// a version of AffineComponent that has a non-(multiple of unit) learning-rate
/// matrix.  See nnet-precondition-online.h for a description of the technique.
/// It is described, under the name Online NG-SGD, in the paper "Parallel
/// training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
/// workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
class NaturalGradientAffineComponent: public AffineComponent {
 public:
  virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  // Initializes with random parameters of the given dims, plus the
  // natural-gradient configuration values.
  void Init(int32 input_dim, int32 output_dim,
            BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
            int32 rank_in, int32 rank_out, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha,
            BaseFloat max_change_per_sample);
  // Initializes by reading the parameter matrix from a file.
  void Init(int32 rank_in, int32 rank_out, int32 update_period,
            BaseFloat num_samples_history,
            BaseFloat alpha, BaseFloat max_change_per_sample,
            std::string matrix_filename);
  // this constructor does not really initialize, use Init() or Read().
  NaturalGradientAffineComponent();
  virtual void Resize(int32 input_dim, int32 output_dim);
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Info() const;
  virtual Component* Copy() const;
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  // copy constructor
  explicit NaturalGradientAffineComponent(
      const NaturalGradientAffineComponent &other);
  virtual void ZeroStats();

 private:
  // disallow assignment operator.
  NaturalGradientAffineComponent &operator= (
      const NaturalGradientAffineComponent&);

  // Configs for preconditioner.  The input side tends to be better conditioned ->
  // smaller rank needed, so make them separately configurable.
  int32 rank_in_;
  int32 rank_out_;
  int32 update_period_;
  BaseFloat num_samples_history_;
  BaseFloat alpha_;

  OnlineNaturalGradient preconditioner_in_;

  OnlineNaturalGradient preconditioner_out_;

  // If > 0, max_change_per_sample_ is the maximum amount of parameter
  // change (in L2 norm) that we allow per sample, averaged over the minibatch.
  // This was introduced in order to control instability.
  // Instead of the exact L2 parameter change, for
  // efficiency purposes we limit a bound on the exact
  // change.  The limit is applied via a constant <= 1.0
  // for each minibatch, A suitable value might be, for
  // example, 10 or so; larger if there are more
  // parameters.
  BaseFloat max_change_per_sample_;

  // update_count_ records how many updates we have done.
  double update_count_;

  // active_scaling_count_ records how many updates we have done,
  // where the scaling factor is active (not 1.0).
  double active_scaling_count_;

  // max_change_scale_stats_ records the sum of scaling factors
  // in each update, so we can compute the averaged scaling factor
  // in Info().
  double max_change_scale_stats_;

  // Sets the configs rank, alpha and eta in the preconditioner objects,
  // from the class variables.
  void SetNaturalGradientConfigs();

  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
};
/// FixedAffineComponent is an affine transform that is supplied
/// at network initialization time and is not trainable.
class FixedAffineComponent: public Component {
 public:
  FixedAffineComponent() { }
  virtual std::string Type() const { return "FixedAffineComponent"; }
  virtual std::string Info() const;

  // Copy constructor from AffineComponent-- can be used when we're done
  // training a particular part of the model and want to efficiently disable
  // further training.
  FixedAffineComponent(const AffineComponent &c);

  /// matrix should be of size input-dim+1 to output-dim, last col is offset
  void Init(const CuMatrixBase<BaseFloat> &matrix);

  // The ConfigLine cfl contains just the option matrix=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
  virtual int32 InputDim() const { return linear_params_.NumCols(); }
  virtual int32 OutputDim() const { return linear_params_.NumRows(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Function to provide access to linear_params_.
  const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 protected:
  friend class AffineComponent;
  // The linear term of the transform; dimensions are output-dim by input-dim
  // (InputDim()/OutputDim() above read off NumCols()/NumRows()).
  CuMatrix<BaseFloat> linear_params_;
  // The constant offset; per the Init() comment above, this comes from the
  // last column of the supplied matrix.
  CuVector<BaseFloat> bias_params_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
};
/// SumGroupComponent is used to sum up groups of posteriors.
/// It's used to introduce a kind of Gaussian-mixture-model-like
/// idea into neural nets.  This is basically a degenerate case of
/// MixtureProbComponent; we had to implement it separately to
/// be efficient for CUDA (we can use this one regardless whether
/// we have CUDA or not; it's the normal case we want anyway).
///
/// There are two forms of initialization in a config file: one
/// where the number of elements are specified for each group
/// individually as a vector, and one where only the total input
/// dimension and the output dimension (number of groups) is specified.
/// The second is used when all groups have the same size.
class SumGroupComponent: public Component {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  void Init(const std::vector<int32> &sizes);  // the vector is of the input dim
                                               // (>= 1) for each output dim.
  void Init(int32 input_dim, int32 output_dim);
  void GetSizes(std::vector<int32> *sizes) const;  // Get a vector saying, for
                                                   // each output-dim, how many
                                                   // inputs were summed over.
  virtual void InitFromConfig(ConfigLine *cfl);
  SumGroupComponent() { }
  virtual std::string Type() const { return "SumGroupComponent"; }
  virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
  // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
  // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
  CuArray<Int32Pair> indexes_;  // for each output index, the (start, end) input
                                // index.
  CuArray<int32> reverse_indexes_;  // for each input index, the output index.
  int32 input_dim_;   // total input dimension (sum over group sizes).
  int32 output_dim_;  // number of groups, i.e. the output dimension.
};
/// FixedScaleComponent applies a fixed per-element scale; it's similar
/// to the Rescale component in the nnet1 setup (and only needed for nnet1
/// model conversion).
class FixedScaleComponent: public Component {
 public:
  FixedScaleComponent() { }
  virtual std::string Type() const { return "FixedScaleComponent"; }
  virtual std::string Info() const;
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
  }

  // Initializes the component from the given vector of per-element scales.
  void Init(const CuVectorBase<BaseFloat> &scales);

  // The ConfigLine cfl contains only the option scales=<string>,
  // where the string is the filename of a Kaldi-format vector to read
  // (note: Init() above takes a vector, one scale per dimension).
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  friend class AffineComponent;  // necessary for collapse
  // The fixed (non-trainable) per-element scales; dimension is the
  // input/output dimension.
  CuVector<BaseFloat> scales_;
  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
};
936 /// FixedBiasComponent applies a fixed per-element bias; it's similar
937 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
938 /// model conversion.
939 class FixedBiasComponent: public Component {
940 public:
941 FixedBiasComponent() { }
942 virtual std::string Type() const { return "FixedBiasComponent"; }
943 virtual std::string Info() const;
945 virtual int32 Properties() const {
946 return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
947 }
949 void Init(const CuVectorBase<BaseFloat> &scales);
951 // The ConfigLine cfl contains only the option bias=<string>,
952 // where the string is the filename of a Kaldi-format matrix to read.
953 virtual void InitFromConfig(ConfigLine *cfl);
954 virtual int32 InputDim() const { return bias_.Dim(); }
955 virtual int32 OutputDim() const { return bias_.Dim(); }
956 using Component::Propagate; // to avoid name hiding
957 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
958 const CuMatrixBase<BaseFloat> &in,
959 CuMatrixBase<BaseFloat> *out) const;
960 virtual void Backprop(const std::string &debug_info,
961 const ComponentPrecomputedIndexes *indexes,
962 const CuMatrixBase<BaseFloat> &, // in_value,
963 const CuMatrixBase<BaseFloat> &, // out_value
964 const CuMatrixBase<BaseFloat> &out_deriv,
965 Component *, // to_update
966 CuMatrixBase<BaseFloat> *in_deriv) const;
967 virtual Component* Copy() const;
968 virtual void Read(std::istream &is, bool binary);
969 virtual void Write(std::ostream &os, bool binary) const;
971 protected:
972 CuVector<BaseFloat> bias_;
973 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
974 };
// NoOpComponent just duplicates its input.  We don't anticipate this being used
// very often, but it may sometimes make your life easier.
class NoOpComponent: public NonlinearComponent {
 public:
  explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
  NoOpComponent() { }
  virtual std::string Type() const { return "NoOpComponent"; }
  virtual int32 Properties() const {
    // in-place propagation is possible since the output equals the input.
    return kSimpleComponent|kLinearInInput|kPropagateInPlace;
  }
  virtual Component* Copy() const { return new NoOpComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
 private:
  NoOpComponent &operator = (const NoOpComponent &other);  // Disallow.
};
// ClipGradientComponent just duplicates its input, but clips gradients
// during backpropagation if they cross a predetermined threshold.
// This component will be used to prevent gradient explosion problem in
// recurrent neural networks.
class ClipGradientComponent: public Component {
 public:
  // Constructor taking all the configuration values and accumulated stats;
  // simply forwards to Init() (see Init() below for the meaning of the
  // arguments, which map one-to-one onto the member variables).
  ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
                        bool norm_based_clipping,
                        BaseFloat self_repair_clipped_proportion_threshold,
                        BaseFloat self_repair_target,
                        BaseFloat self_repair_scale,
                        int32 num_clipped,
                        int32 count,
                        int32 num_self_repaired,
                        int32 num_backpropped) {
    Init(dim, clipping_threshold, norm_based_clipping,
         self_repair_clipped_proportion_threshold,
         self_repair_target,
         self_repair_scale,
         num_clipped, count,
         num_self_repaired, num_backpropped);}

  // Default constructor: sets inert defaults (dim 0, clipping disabled via
  // threshold -1, zeroed stats); use InitFromConfig()/Init()/Read() to
  // really initialize.
  ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
                           norm_based_clipping_(false),
                           self_repair_clipped_proportion_threshold_(1.0),
                           self_repair_target_(0.0),
                           self_repair_scale_(0.0),
                           num_clipped_(0), count_(0),
                           num_self_repaired_(0), num_backpropped_(0) { }

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
            BaseFloat self_repair_clipped_proportion_threshold,
            BaseFloat self_repair_target,
            BaseFloat self_repair_scale,
            int32 num_clipped, int32 count,
            int32 num_self_repaired, int32 num_backpropped);

  virtual std::string Type() const { return "ClipGradientComponent"; }

  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
           kBackpropNeedsInput;
  }

  virtual void ZeroStats();

  virtual Component* Copy() const {
    return new ClipGradientComponent(dim_,
                                     clipping_threshold_,
                                     norm_based_clipping_,
                                     self_repair_clipped_proportion_threshold_,
                                     self_repair_target_,
                                     self_repair_scale_,
                                     num_clipped_,
                                     count_,
                                     num_self_repaired_,
                                     num_backpropped_);}

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
  // Destructor logs a summary of how often self-repair was triggered
  // during this training job (only if it was triggered at least once).
  virtual ~ClipGradientComponent() {
    if (num_self_repaired_ > 0)
      KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
                << ")'s self-repair was activated " << num_self_repaired_
                << " time(s) out of " << num_backpropped_
                << " times of calling Backprop() in this training job.";
  }
 private:
  int32 dim_;  // input/output dimension
  BaseFloat clipping_threshold_;  // threshold to be used for clipping
                                  // could correspond to max-row-norm (if
                                  // norm_based_clipping_ == true) or
                                  // max-absolute-value (otherwise)
  bool norm_based_clipping_;  // if true the max-row-norm will be clipped
                              // else element-wise absolute value clipping is
                              // done

  // some configuration values relating to self-repairing.
  BaseFloat self_repair_clipped_proportion_threshold_;  // the threshold of
                                                        // clipped-proportion
                                                        // for self-repair to be
                                                        // activated
  BaseFloat self_repair_target_;  // the target value towards which self-repair
                                  // is trying to set for in-deriv
  BaseFloat self_repair_scale_;   // constant scaling the self-repair vector
  std::string debug_info_;  // component-node name, used in the destructor to
                            // print out stats of self-repair

  // this function is called from Backprop code, and only does something if the
  // self-repair-scale config value is set and the current clipped proportion
  // exceeds the threshold. What it does is to add a term to in-deriv that
  // forces the input to the ClipGradientComponent to be close to some small
  // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
  // Sigmoid or Tanh or Affine). The hope is that if the input is forced to be
  // small, the parameters on the path will also tend to be small, which may
  // help tamp down the divergence caused by gradient explosion.
  void RepairGradients(const std::string &debug_info,
                       const CuMatrixBase<BaseFloat> &in_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       ClipGradientComponent *to_update) const;

  ClipGradientComponent &operator =
      (const ClipGradientComponent &other);  // Disallow.

 protected:
  // variables to store stats
  // An element corresponds to rows of derivative matrix, when
  // norm_based_clipping_ is true,
  // else it corresponds to each element of the derivative matrix
  // Note: no stats are stored when norm_based_clipping_ is false
  int32 num_clipped_;  // number of elements which were clipped
  int32 count_;  // number of elements which were processed
  int32 num_self_repaired_;  // number of times self-repair is activated
  int32 num_backpropped_;  // number of times backprop is called

};
/** PermuteComponent changes the order of the columns (i.e. the feature or
    activation dimensions).  Output dimension i is mapped to input dimension
    column_map_[i], so it's like doing:
      for each row:
        for each feature/activation dimension i:
          output(row, i) = input(row, column_map_[i]).
*/
class PermuteComponent: public Component {
 public:
  PermuteComponent() {}
  PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }

  virtual int32 InputDim() const { return column_map_.Dim(); }
  virtual int32 OutputDim() const { return column_map_.Dim(); }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(const std::vector<int32> &column_map);

  virtual std::string Type() const { return "PermuteComponent"; }

  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput;
  }

  virtual void ZeroStats() {}

  virtual Component* Copy() const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // No trainable parameters, so Scale()/Add() are no-ops.
  virtual void Scale(BaseFloat scale) {}
  virtual void Add(BaseFloat alpha, const Component &other) {}
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
 private:
  // computes the reverse column map.  Must not be called if column_map_.Dim()
  // == 0
  void ComputeReverseColumnMap();
  CuArray<int32> column_map_;
  // the following is a derived variable, not written to disk.
  // It is used in backprop.
  CuArray<int32> reverse_column_map_;
  PermuteComponent &operator =
      (const PermuteComponent &other);  // Disallow.
};
// PerElementScaleComponent scales each dimension of its input with a separate
// trainable scale; it's like a linear component with a diagonal matrix.
class PerElementScaleComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementScaleComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kLinearInInput|
        kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementScaleComponent(const PerElementScaleComponent &other);

  // Random initialization: scales drawn with the given mean and stddev.
  void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
  // Initialization from a Kaldi-format vector read from the given file.
  void Init(std::string vector_filename);

 protected:
  friend class AffineComponent;  // necessary for collapse
  // This function Update() is for extensibility; child classes may override
  // this, e.g. for natural gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv) {
    UpdateSimple(in_value, out_deriv);
  }
  // UpdateSimple is used when *this is a gradient.  Child classes may override
  // this if needed, but typically won't need to.
  virtual void UpdateSimple(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const PerElementScaleComponent &operator
      = (const PerElementScaleComponent &other);  // Disallow.
  // The trainable per-element scales (the component's parameters).
  CuVector<BaseFloat> scales_;
};
// PerElementOffsetComponent offsets each dimension of its input with a separate
// trainable bias; it's like an affine component with fixed weight matrix which
// is always equal to I.
class PerElementOffsetComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return offsets_.Dim(); }
  virtual int32 OutputDim() const { return offsets_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementOffsetComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementOffsetComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|
        kBackpropInPlace|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);

  // Random initialization: offsets drawn with the given mean and stddev.
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev);
  // Initialization from a Kaldi-format vector read from the given file.
  void Init(std::string vector_filename);

 protected:
  const PerElementOffsetComponent &operator
      = (const PerElementOffsetComponent &other);  // Disallow.
  // The trainable per-element offsets (the component's parameters).
  CuVector<BaseFloat> offsets_;
};
// ConstantFunctionComponent returns constant function of its input,
// i.e. its output does not depend on its input.  It is the same as
// an affine component with the linear term fixed at zero.
// It is optionally trainable, and optionally you can use natural
// gradient.  The input is required only because it's more convenient
// to make SimpleComponents [but see ConstantComponent, which requires
// no inputs].
class ConstantFunctionComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_.Dim(); }

  virtual std::string Info() const;
  // possible parameter values with their defaults:
  // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
  // output-mean=0 output-stddev=0
  virtual void InitFromConfig(ConfigLine *cfl);

  // this constructor does not really initialize, use InitFromConfig() or Read().
  ConstantFunctionComponent();

  ConstantFunctionComponent(const ConstantFunctionComponent &other);

  virtual std::string Type() const { return "ConstantFunctionComponent"; }
  virtual int32 Properties() const {
    // updatable-related flags depend on the is-updatable config value, and
    // in-place propagation is only possible when input and output dims match.
    return kSimpleComponent|
        (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
        (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
        kBackpropAdds;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 private:
  int32 input_dim_;
  // the output value-- a vector.
  CuVector<BaseFloat> output_;

  bool is_updatable_;
  // if true, and if updatable, do natural-gradient update.
  bool use_natural_gradient_;
  OnlineNaturalGradient preconditioner_;

  const ConstantFunctionComponent &operator
      = (const ConstantFunctionComponent &other);  // Disallow.
};
// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
// it uses a natural gradient update for the per-element scales, and enforces a
// maximum amount of change per minibatch, for stability.
class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
 public:

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  NaturalGradientPerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const {
    return "NaturalGradientPerElementScaleComponent";
  }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions that are specific to this class:
  explicit NaturalGradientPerElementScaleComponent(
      const NaturalGradientPerElementScaleComponent &other);

  // Random initialization of the scales, plus the natural-gradient and
  // max-change configuration values.
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev, int32 rank, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha,
            BaseFloat max_change_per_minibatch);
  // Initialization of the scales from a Kaldi-format vector file, plus the
  // natural-gradient and max-change configuration values.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha, BaseFloat max_change_per_minibatch);

 private:
  // configuration value for imposing max-change...
  BaseFloat max_change_per_minibatch_;

  // unlike the NaturalGradientAffineComponent, there is only one dimension to
  // consider as the parameters are a vector not a matrix, so we only need one
  // preconditioner.
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  // Override of the parent-class Update() function, called only
  // if this->is_gradient_ = false; this implements the natural
  // gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientPerElementScaleComponent &operator
      = (const NaturalGradientPerElementScaleComponent &other);  // Disallow.
};
1450 /**
1451 * ConvolutionalComponent implements 2d-convolution.
1452 * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1453 * 2 dimensions as it has same size as the input along the 3rd dimension.
1454 * Input : A matrix where each row is a vectorized 3D-tensor.
1455 * The 3D tensor has dimensions
1456 * x: (e.g. time)
1457 * y: (e.g. frequency)
1458 * z: (e.g. channels like features/delta/delta-delta)
1459 *
1460 * The component supports input vectorizations of type zyx and yzx.
1461 * The default vectorization type is zyx.
1462 * e.g. for input vectorization of type zyx the input is vectorized by
1463 * spanning axes z, y and x of the tensor in that order.
1464 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1465 * the zyx vectorized input looks like
1466 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1467 *
1468 *
1469 * Output : The output is also a 3D tensor vectorized in the zyx format.
1470 * The channel axis (z) in the output corresponds to the output of
1471 * different filters. The first channel corresponds to the first filter
1472 * i.e., first row of the filter_params_ matrix.
1473 *
1474 * Note: The component has to support yzx input vectorization as the binaries
1475 * like add-deltas generate yz vectorized output. These input vectors are
1476 * concatenated using the Append descriptor across time steps to form a yzx
1477 * vectorized 3D tensor input.
1478 * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1479 *
1480 *
1481 * For information on the hyperparameters and parameters of this component see
1482 * the variable declarations.
1483 *
1484 * Propagation:
1485 * ------------
1486 * Convolution operation consists of a dot-products between the filter tensor
1487 * and input tensor patch, for various shifts of filter tensor along the x and y
1488 * axes input tensor. (Note: there is no shift along z-axis as the filter and
1489 * input tensor have same size along this axis).
1490 *
1491 * For a particular shift (i,j) of the filter tensor
1492 * along input tensor dimensions x and y, the elements of the input tensor which
1493 * overlap with the filter form the input tensor patch. This patch is vectorized
1494 * in zyx format. All the patches corresponding to various samples in the
1495 * mini-batch are stacked into a matrix, where each row corresponds to one
1496 * patch. Let this matrix be represented by X_{i,j}. The dot products with
1497 * various filters are computed simultaneously by computing the matrix product
1498 * with the filter_params_ matrix (W)
1499 * Y_{i,j} = X_{i,j}*W^T.
1500 * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1501 *
1502 * All the matrix products corresponding to various shifts (i,j) of the
1503 * filter tensor are computed simultaneously using the AddMatMatBatched
1504 * call of CuMatrixBase class.
1505 *
1506 * BackPropagation:
1507 * ----------------
1508 * Backpropagation to compute the input derivative (\nabla X_{i,j})
 * consists of a series of matrix products.
1510 * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the
1511 * output derivative for a particular shift of the filter.
1512 *
1513 * Once again these matrix products are computed simultaneously.
1514 *
1515 * Update:
1516 * -------
1517 * The weight gradient is computed as
1518 * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j})
1519 *
1520 */
1521 class ConvolutionComponent: public UpdatableComponent {
1522 public:
1523 enum TensorVectorizationType {
1524 kYzx = 0,
1525 kZyx = 1
1526 };
1528 ConvolutionComponent();
1529 // constructor using another component
1530 ConvolutionComponent(const ConvolutionComponent &component);
1531 // constructor using parameters
1532 ConvolutionComponent(
1533 const CuMatrixBase<BaseFloat> &filter_params,
1534 const CuVectorBase<BaseFloat> &bias_params,
1535 int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1536 int32 filt_x_dim, int32 filt_y_dim,
1537 int32 filt_x_step, int32 filt_y_step,
1538 TensorVectorizationType input_vectorization,
1539 BaseFloat learning_rate);
1541 virtual int32 InputDim() const;
1542 virtual int32 OutputDim() const;
1544 virtual std::string Info() const;
1545 virtual void InitFromConfig(ConfigLine *cfl);
1546 virtual std::string Type() const { return "ConvolutionComponent"; }
1547 virtual int32 Properties() const {
1548 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
1549 kBackpropAdds|kPropagateAdds;
1550 }
1552 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1553 const CuMatrixBase<BaseFloat> &in,
1554 CuMatrixBase<BaseFloat> *out) const;
1555 virtual void Backprop(const std::string &debug_info,
1556 const ComponentPrecomputedIndexes *indexes,
1557 const CuMatrixBase<BaseFloat> &in_value,
1558 const CuMatrixBase<BaseFloat> &, // out_value,
1559 const CuMatrixBase<BaseFloat> &out_deriv,
1560 Component *to_update_in,
1561 CuMatrixBase<BaseFloat> *in_deriv) const;
1562 void Update(const std::string &debug_info,
1563 const CuMatrixBase<BaseFloat> &in_value,
1564 const CuMatrixBase<BaseFloat> &out_deriv,
1565 const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);
1568 virtual void Read(std::istream &is, bool binary);
1569 virtual void Write(std::ostream &os, bool binary) const;
1571 virtual Component* Copy() const;
1573 // Some functions from base-class UpdatableComponent.
1574 virtual void Scale(BaseFloat scale);
1575 virtual void Add(BaseFloat alpha, const Component &other);
1576 virtual void PerturbParams(BaseFloat stddev);
1577 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1578 virtual int32 NumParameters() const;
1579 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1580 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
1582 // Some functions that are specific to this class.
1583 void SetParams(const VectorBase<BaseFloat> &bias,
1584 const MatrixBase<BaseFloat> &filter);
1585 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
1586 const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
1587 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1588 int32 filt_x_dim, int32 filt_y_dim,
1589 int32 filt_x_step, int32 filt_y_step, int32 num_filters,
1590 TensorVectorizationType input_vectorization,
1591 BaseFloat param_stddev, BaseFloat bias_stddev);
1592 // there is no filt_z_dim parameter as the length of the filter along
1593 // z-dimension is same as the input
1594 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1595 int32 filt_x_dim, int32 filt_y_dim,
1596 int32 filt_x_step, int32 filt_y_step,
1597 TensorVectorizationType input_vectorization,
1598 std::string matrix_filename);
1600 // resize the component, setting the parameters to zero, while
1601 // leaving any other configuration values the same
1602 void Resize(int32 input_dim, int32 output_dim);
1604 void Update(const std::string &debug_info,
1605 const CuMatrixBase<BaseFloat> &in_value,
1606 const CuMatrixBase<BaseFloat> &out_deriv);
1609 private:
1610 int32 input_x_dim_; // size of the input along x-axis
1611 // (e.g. number of time steps)
1613 int32 input_y_dim_; // size of input along y-axis
1614 // (e.g. number of mel-frequency bins)
1616 int32 input_z_dim_; // size of input along z-axis
1617 // (e.g. number of channels is 3 if the input has
1618 // features + delta + delta-delta features
1620 int32 filt_x_dim_; // size of the filter along x-axis
1622 int32 filt_y_dim_; // size of the filter along y-axis
1624 // there is no filt_z_dim_ as it is always assumed to be
1625 // the same as input_z_dim_
1627 int32 filt_x_step_; // the number of steps taken along x-axis of input
1628 // before computing the next dot-product
1629 // of filter and input
1631 int32 filt_y_step_; // the number of steps taken along y-axis of input
1632 // before computing the next dot-product of the filter
1633 // and input
1635 // there is no filt_z_step_ as only dot product is possible along this axis
1637 TensorVectorizationType input_vectorization_; // type of vectorization of the
1638 // input 3D tensor. Accepts zyx and yzx formats
1640 CuMatrix<BaseFloat> filter_params_;
1641 // the filter (or kernel) matrix is a matrix of vectorized 3D filters
1642 // where each row in the matrix corresponds to one filter.
1643 // The 3D filter tensor is vectorizedin zyx format.
1644 // The first row of the matrix corresponds to the first filter and so on.
1645 // Keep in mind the vectorization type and order of filters when using file
1646 // based initialization.
1648 CuVector<BaseFloat> bias_params_;
1649 // the filter-specific bias vector (i.e., there is a seperate bias added
1650 // to the output of each filter).
1651 bool is_gradient_;
1653 void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
1654 CuMatrix<BaseFloat> *patches) const;
1655 void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
1656 CuMatrixBase<BaseFloat> *in_deriv) const;
1657 const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
1658 };
1661 // LstmNonlinearityComponent is a component that implements part of an LSTM, by
1662 // combining together the sigmoids and tanh's, plus some diagonal terms, into
1663 // a single block.
1664 // We will refer to the LSTM formulation used in
1665 //
1666 // "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
1667 // by H. Sak et al,
1668 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1669 //
1670 // Suppose the cell dimension is C. Then outside this component, we compute
1671 // the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1672 // matrix multiplication:
1673 //
1674 // i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1675 // f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1676 // c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
1677 // o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
1678 //
1679 // The part of the computation that takes place in this component is as follows.
1680 // Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and
1681 // c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1682 //
1683 // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1684 //
1685 //
1686 // This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f
1687 // and w_o.
1688 //
1689 //
1690 // In the forward pass (Propagate), this component computes the following:
1691 //
1692 // i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
1693 // f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
1694 // c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
1695 // o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
1696 // m_t = o_t * Tanh(c_t) (5)
1697 // # note: the outputs are just c_t and m_t.
1698 //
1699 // The backprop is as you would think, but for the "self-repair" we need to pass
1700 // in additional vectors (of the same dim as the parameters of the layer) that
1701 // dictate whether or not we add an additional term to the backpropagated
1702 // derivatives. (This term helps force the input to the nonlinearities into the
1703 // range where the derivatives are not too small).
1704 //
1705 // This component stores stats of the same form as are normally stored by the
1706 // StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1707 // activations and derivatives, but this is done inside the Backprop() functions.
1708 // [the StoreStats() functions don't take the input data as an argument, so
1709 // storing this data that way is impossible, and anyway it's more efficient to
1710 // do it as part of backprop.]
1711 class LstmNonlinearityComponent: public UpdatableComponent {
1712 public:
1714 virtual int32 InputDim() const;
1715 virtual int32 OutputDim() const;
1716 virtual std::string Info() const;
1717 virtual void InitFromConfig(ConfigLine *cfl);
1718 LstmNonlinearityComponent() { } // use Init to really initialize.
1719 virtual std::string Type() const { return "LstmNonlinearityComponent"; }
1720 virtual int32 Properties() const {
1721 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
1722 }
1724 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1725 const CuMatrixBase<BaseFloat> &in,
1726 CuMatrixBase<BaseFloat> *out) const;
1727 virtual void Backprop(const std::string &debug_info,
1728 const ComponentPrecomputedIndexes *indexes,
1729 const CuMatrixBase<BaseFloat> &in_value,
1730 const CuMatrixBase<BaseFloat> &, // out_value,
1731 const CuMatrixBase<BaseFloat> &out_deriv,
1732 Component *to_update_in,
1733 CuMatrixBase<BaseFloat> *in_deriv) const;
1735 virtual void Read(std::istream &is, bool binary);
1736 virtual void Write(std::ostream &os, bool binary) const;
1738 virtual Component* Copy() const;
1740 // Some functions from base-class UpdatableComponent.
1741 virtual void Scale(BaseFloat scale);
1742 virtual void Add(BaseFloat alpha, const Component &other);
1743 virtual void PerturbParams(BaseFloat stddev);
1744 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1745 virtual int32 NumParameters() const;
1746 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1747 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
1748 virtual void ZeroStats();
1750 // Some functions that are specific to this class:
1751 explicit LstmNonlinearityComponent(
1752 const LstmNonlinearityComponent &other);
1754 void Init(int32 cell_dim, BaseFloat param_stddev,
1755 BaseFloat tanh_self_repair_threshold,
1756 BaseFloat sigmoid_self_repair_threshold,
1757 BaseFloat self_repair_scale);
1759 void Init(std::string vector_filename,
1760 int32 rank, int32 update_period, BaseFloat num_samples_history,
1761 BaseFloat alpha, BaseFloat max_change_per_minibatch);
1763 private:
1765 // Initializes the natural-gradient object with the configuration we
1766 // use for this object, which for now is hardcoded at the C++ level.
1767 void InitNaturalGradient();
1770 // Notation: C is the cell dimension; it equals params_.NumCols().
1772 // The dimension of the parameter matrix is (3 x C);
1773 // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
1774 CuMatrix<BaseFloat> params_;
1776 // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1777 // equations (1) through (5), this is the sum of the values of the nonliearities
1778 // (used for diagnostics only). It is comparable to value_sum_ vector
1779 // in base-class NonlinearComponent.
1780 CuMatrix<double> value_sum_;
1782 // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1783 // equations (1) through (5), this is the sum of the derivatives of the
1784 // nonliearities (used for diagnostics and to control self-repair). It is
1785 // comparable to the deriv_sum_ vector in base-class
1786 // NonlinearComponent.
1787 CuMatrix<double> deriv_sum_;
1789 // This matrix has dimension 10. The contents are a block of 5 self-repair
1790 // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
1791 // self-repair scales (typically all 0.00001). These are for each of the 5
1792 // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
1793 // more info).
1794 CuVector<BaseFloat> self_repair_config_;
1796 // This matrix has dimension 5. For each of the 5 nonlinearities in the LSTM
1797 // component (see comments in cu-math.h for more info), it contains the total,
1798 // over all frames represented in count_, of the number of dimensions that
1799 // were subject to self_repair. To get the self-repair proportion you should
1800 // divide by (count_ times cell_dim_).
1801 CuVector<double> self_repair_total_;
1803 // The total count (number of frames) corresponding to the stats in value_sum_
1804 // and deriv_sum_.
1805 double count_;
1807 // Preconditioner for the parameters of this component [operates in the space
1808 // of dimension C].
1809 // The preconditioner stores its own configuration values; we write and read
1810 // these, but not the preconditioner object itself.
1811 OnlineNaturalGradient preconditioner_;
1813 const LstmNonlinearityComponent &operator
1814 = (const LstmNonlinearityComponent &other); // Disallow.
1815 };
1820 /*
1821 * MaxPoolingComponent :
1822 * Maxpooling component was first used in ConvNet for selecting a
1823 * representative activation in an area. It inspired Maxout nonlinearity.
1824 * Each output element of this component is the maximum of a block of
1825 * input elements where the block has a 3D dimension (pool_x_size_,
1826 * pool_y_size_, pool_z_size_).
1827 * Blocks could overlap if the shift value on any axis is smaller
1828 * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
1829 * If the shift values are equal to their pool size, there is no
1830 * overlap; while if they all equal 1, the blocks overlap to
1831 * the greatest possible extent.
1832 *
1833 * This component is designed to be used after a ConvolutionComponent
1834 * so that the input matrix is propagated from a 2d-convolutional layer.
1835 * This component implements 3d-maxpooling which performs
1836 * max pooling along the three axes.
1837 * Input : A matrix where each row is a vectorized 3D-tensor.
1838 * The 3D tensor has dimensions
1839 * x: (e.g. time)
1840 * y: (e.g. frequency)
1841 * z: (e.g. channels like number of filters in the ConvolutionComponent)
1842 *
1843 * The component assumes input vectorizations of type zyx
1844 * which is the default output vectorization type of a ConvolutionComponent.
1845 * e.g. for input vectorization of type zyx the input is vectorized by
1846 * spanning axes z, y and x of the tensor in that order.
1847 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1848 * the zyx vectorized input looks like
1849 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1850 *
1851 * Output : The output is also a 3D tensor vectorized in the zyx format.
1852 *
1853 * For information on the hyperparameters and parameters of this component see
1854 * the variable declarations.
1855 *
1856 *
1857 */
class MaxpoolingComponent: public Component {
 public:
  // Default constructor: leaves the component unconfigured (all dims and
  // pool sizes/steps zero); use InitFromConfig() to really initialize.
  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
  // constructor using another component
  MaxpoolingComponent(const MaxpoolingComponent &component);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  // Consistency check of the configuration values.  NOTE(review): presumably
  // verifies that the dims and pool sizes/steps are compatible -- confirm
  // against the .cc implementation.
  virtual void Check() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
        kBackpropAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }

  // Converts the input matrix (one row per example) to a matrix of
  // vectorized pooling-window patches, one patch per (pool position,
  // example) pair.
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Scatters per-patch input derivatives back into the full input-derivative
  // matrix (the reverse mapping of InputToInputPatches).
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;

 protected:
  int32 input_x_dim_;   // size of the input along x-axis
                        // (e.g. number of time steps)
  int32 input_y_dim_;   // size of input along y-axis
                        // (e.g. number of mel-frequency bins)
  int32 input_z_dim_;   // size of input along z-axis
                        // (e.g. number of filters in the ConvolutionComponent)

  int32 pool_x_size_;   // size of the pooling window along x-axis
  int32 pool_y_size_;   // size of the pooling window along y-axis
  int32 pool_z_size_;   // size of the pooling window along z-axis

  int32 pool_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next pool
  int32 pool_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next pool
  int32 pool_z_step_;   // the number of steps taken along z-axis of input
                        // before computing the next pool

};
1925 /**
1926 CompositeComponent is a component representing a sequence of
1927 [simple] components. The config line would be something like the following
1928 (imagine this is all on one line):
1930 component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
1931 component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
1932 component2='type=RectifiedLinearComponent dim=10000' \
1933 component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
1935 The reason you might want to use this component, instead of directly using
1936 the same sequence of components in the config file, is to save GPU memory (at
1937 the expense of more compute)-- because doing it like this means we have to
1938 re-do parts of the forward pass in the backprop phase, but we avoid using
1939 much memory for very long (and you can make the memory usage very small by
1940 making max-rows-process small). We inherit from UpdatableComponent just in
1941 case one or more of the components in the sequence are updatable.
1943 It is an error to nest a CompositeComponent inside a CompositeComponent.
1944 The same effect can be accomplished by specifying a smaller max-rows-process
1945 in a single CompositeComponent.
1946 */
1947 class CompositeComponent: public UpdatableComponent {
1948 public:
1949 virtual int32 InputDim() const;
1950 virtual int32 OutputDim() const;
1952 virtual std::string Info() const;
1954 virtual void InitFromConfig(ConfigLine *cfl);
1956 virtual Component* Copy() const;
1958 CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.
1960 // Initialize from this list of components; takes ownership of the pointers.
1961 void Init(const std::vector<Component*> &components,
1962 int32 max_rows_process);
1964 virtual std::string Type() const { return "CompositeComponent"; }
1966 // The properties depend on the properties of the constituent components. As
1967 // a special case, we never return kStoresStats in the properties: by default
1968 // we store things like activation stats (e.g. for nonlinear components like
1969 // ReLU) as part of the backprop. This means we may wastefully store stats
1970 // even when not requested, but it does save time as a separate StoreStats()
1971 // call would involve propagating the internals.
1972 virtual int32 Properties() const;
1974 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1975 const CuMatrixBase<BaseFloat> &in,
1976 CuMatrixBase<BaseFloat> *out) const;
1977 virtual void Backprop(const std::string &debug_info,
1978 const ComponentPrecomputedIndexes *indexes,
1979 const CuMatrixBase<BaseFloat> &in_value,
1980 const CuMatrixBase<BaseFloat> &, // out_value
1981 const CuMatrixBase<BaseFloat> &out_deriv,
1982 Component *to_update,
1983 CuMatrixBase<BaseFloat> *in_deriv) const;
1985 // note, we don't implement StoreStats() as it would be inefficient. Instead,
1986 // by default we call StoreStats() on all members that have the flag set,
1987 // inside the Backprop.
1988 virtual void ZeroStats();
1990 virtual void Read(std::istream &is, bool binary);
1991 virtual void Write(std::ostream &os, bool binary) const;
1993 // Don't implement Copy() at this level: implement it in the child class.
1995 // Some functions from base-class UpdatableComponent.
1996 virtual void SetUnderlyingLearningRate(BaseFloat lrate);
1997 virtual void SetActualLearningRate(BaseFloat lrate);
1998 virtual void SetAsGradient();
1999 virtual void Scale(BaseFloat scale);
2000 virtual void Add(BaseFloat alpha, const Component &other);
2001 virtual void PerturbParams(BaseFloat stddev);
2002 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
2003 virtual int32 NumParameters() const;
2004 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
2005 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
2007 // note: we dont implement the StoreStats function as it would be quite
2008 // expensive; instead, by default we call StoreStats() for any components that
2009 // want to store stats, as part of the backprop pass. This is not 100% ideal
2010 // but it will usually do what you want. We can revisit this later if needed.
2012 // Functions to iterate over the internal components
2014 int32 NumComponents() const { return components_.size();}
2015 /// Gets the ith component in this component.
2016 /// The ordering is the same as in the config line. The caller
2017 /// does not own the received component.
2018 const Component* GetComponent(int32 i) const;
2019 /// Sets the ith component. After this call, CompositeComponent owns
2020 /// the reference to the argument component. Frees the previous
2021 /// ith component.
2022 void SetComponent(int32 i, Component *component);
2024 virtual ~CompositeComponent() { DeletePointers(&components_); }
2025 private:
2026 // returns the stride type, kDefaultStride or kStrideEqualNumCols,
2027 // at the output of the i'th component.
2028 inline MatrixStrideType GetStrideType(int32 i) const;
2030 // returns true if at least one of 'components_' returns the kUpdatable flag
2031 // in its flags.
2032 bool IsUpdatable() const;
2034 // the maximum number of
2035 int32 max_rows_process_;
2036 std::vector<Component*> components_;
2038 };
2041 } // namespace nnet3
2042 } // namespace kaldi
2045 #endif