src/nnet3/nnet-simple-component.h

   1 // nnet3/nnet-simple-component.h
   2
   3 // Copyright 2011-2013  Karel Vesely
   4 //           2012-2015  Johns Hopkins University (author: Daniel Povey)
   5 //                2013  Xiaohui Zhang
   6 //           2014-2015  Vijayaditya Peddinti
   7 //           2014-2015  Guoguo Chen
   8 //                2015  Daniel Galvez
   9 //                2015  Tom Ko
  10
  11 // See ../../COPYING for clarification regarding multiple authors
  12 //
  13 // Licensed under the Apache License, Version 2.0 (the "License");
  14 // you may not use this file except in compliance with the License.
  15 // You may obtain a copy of the License at
  16 //
  17 //  http://www.apache.org/licenses/LICENSE-2.0
  18 //
  19 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  20 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  21 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  22 // MERCHANTABLITY OR NON-INFRINGEMENT.
  23 // See the Apache 2 License for the specific language governing permissions and
  24 // limitations under the License.
  25
  26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
  27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
  28
  29 #include "nnet3/nnet-common.h"
  30 #include "nnet3/nnet-component-itf.h"
  31 #include "nnet3/natural-gradient-online.h"
  32 #include <iostream>
  33
  34 namespace kaldi {
  35 namespace nnet3 {
  36
  37 /// @file  nnet-simple-component.h
  38 ///   This file contains declarations of components that are "simple", meaning
  39 ///   they don't care about the indexes they are operating on, produce one
  40 ///   output for one input, and return the kSimpleComponent flag in their
  41 ///   Properties(): for example, tanh and affine components.  In
  42 ///   nnet-general-component.h there are components that don't fit this pattern.
  43
  44 // This "nnet3" version of the p-norm component only supports the 2-norm.
  45 class PnormComponent: public Component {
  46  public:
  47   void Init(int32 input_dim, int32 output_dim);
  48   explicit PnormComponent(int32 input_dim, int32 output_dim) {
  49     Init(input_dim, output_dim);
  50   }
  51   virtual int32 Properties() const {
  52     return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
  53   }
  54   PnormComponent(): input_dim_(0), output_dim_(0) { }
  55   virtual std::string Type() const { return "PnormComponent"; }
  56   virtual void InitFromConfig(ConfigLine *cfl);
  57   virtual int32 InputDim() const { return input_dim_; }
  58   virtual int32 OutputDim() const { return output_dim_; }
  59   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
  60                          const CuMatrixBase<BaseFloat> &in,
  61                          CuMatrixBase<BaseFloat> *out) const;
  62   virtual void Backprop(const std::string &debug_info,
  63                         const ComponentPrecomputedIndexes *indexes,
  64                         const CuMatrixBase<BaseFloat> &in_value,
  65                         const CuMatrixBase<BaseFloat> &out_value,
  66                         const CuMatrixBase<BaseFloat> &out_deriv,
  67                         Component *to_update,
  68                         CuMatrixBase<BaseFloat> *in_deriv) const;
  69   virtual Component* Copy() const { return new PnormComponent(input_dim_,
  70                                                               output_dim_); }
  71
  72   virtual void Read(std::istream &is, bool binary); // This Read function
  73   // requires that the Component has the correct type.
  74
  75   /// Write component to stream
  76   virtual void Write(std::ostream &os, bool binary) const;
  77
  78  protected:
  79   int32 input_dim_;
  80   int32 output_dim_;
  81 };
  82
  83 // This component randomly zeros dropout_proportion of the input
  84 // and the derivatives are backpropagated through the nonzero inputs.
  85 // Typically this component used during training but not in test time.
  86 // The idea is described under the name Dropout, in the paper
  87 // "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
  88 class DropoutComponent : public RandomComponent {
  89  public:
  90   void Init(int32 dim, BaseFloat dropout_proportion = 0.0);
  91
  92   DropoutComponent(int32 dim, BaseFloat dropout = 0.0) { Init(dim, dropout); }
  93
  94   DropoutComponent(): dim_(0), dropout_proportion_(0.0) { }
  95
  96   virtual int32 Properties() const {
  97     return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
  98   }
  99   virtual std::string Type() const { return "DropoutComponent"; }
 100
 101   virtual void InitFromConfig(ConfigLine *cfl);
 102
 103   virtual int32 InputDim() const { return dim_; }
 104
 105   virtual int32 OutputDim() const { return dim_; }
 106
 107   virtual void Read(std::istream &is, bool binary);
 108
 109   // Write component to stream
 110   virtual void Write(std::ostream &os, bool binary) const;
 111
 112   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 113                          const CuMatrixBase<BaseFloat> &in,
 114                          CuMatrixBase<BaseFloat> *out) const;
 115   virtual void Backprop(const std::string &debug_info,
 116                         const ComponentPrecomputedIndexes *indexes,
 117                         const CuMatrixBase<BaseFloat> &in_value,
 118                         const CuMatrixBase<BaseFloat> &out_value,
 119                         const CuMatrixBase<BaseFloat> &out_deriv,
 120                         Component *to_update,
 121                         CuMatrixBase<BaseFloat> *in_deriv) const;
 122   virtual Component* Copy() const { return new DropoutComponent(dim_,
 123                                                                 dropout_proportion_); }
 124   virtual std::string Info() const;
 125
 126   void SetDropoutProportion(BaseFloat dropout_proportion) { dropout_proportion_ = dropout_proportion; }
 127
 128  private:
 129   int32 dim_;
 130   /// dropout-proportion is the proportion that is dropped out,
 131   /// e.g. if 0.1, we set 10% to zero value.
 132   BaseFloat dropout_proportion_;
 133
 134 };
 135
 136 class ElementwiseProductComponent: public Component {
 137  public:
 138   void Init(int32 input_dim, int32 output_dim);
 139   explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
 140     Init(input_dim, output_dim);
 141   }
 142   virtual int32 Properties() const {
 143     return kSimpleComponent|kBackpropNeedsInput;
 144   }
 145   ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
 146   virtual std::string Type() const { return "ElementwiseProductComponent"; }
 147   virtual void InitFromConfig(ConfigLine *cfl);
 148   virtual int32 InputDim() const { return input_dim_; }
 149   virtual int32 OutputDim() const { return output_dim_; }
 150   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 151                          const CuMatrixBase<BaseFloat> &in,
 152                          CuMatrixBase<BaseFloat> *out) const;
 153   virtual void Backprop(const std::string &debug_info,
 154                         const ComponentPrecomputedIndexes *indexes,
 155                         const CuMatrixBase<BaseFloat> &in_value,
 156                         const CuMatrixBase<BaseFloat> &out_value,
 157                         const CuMatrixBase<BaseFloat> &out_deriv,
 158                         Component *to_update,
 159                         CuMatrixBase<BaseFloat> *in_deriv) const;
 160   virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
 161                                                               output_dim_); }
 162
 163   virtual void Read(std::istream &is, bool binary); // This Read function
 164   // requires that the Component has the correct type.
 165
 166   /// Write component to stream
 167   virtual void Write(std::ostream &os, bool binary) const;
 168
 169  protected:
 170   int32 input_dim_;
 171   int32 output_dim_;
 172 };
 173
 174 class NormalizeComponent: public Component {
 175  public:
 176  void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
 177   explicit NormalizeComponent(int32 input_dim,
 178                               BaseFloat target_rms = 1.0,
 179                               bool add_log_stddev = false) {
 180     Init(input_dim, target_rms, add_log_stddev);
 181   }
 182   explicit NormalizeComponent(const NormalizeComponent &other);
 183   // note: there is some special code in NonlinerComponent::Info() that
 184   // specifically caters to this class.
 185   virtual int32 Properties() const {
 186     return (add_log_stddev_ ?
 187             kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
 188             kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
 189             kBackpropAdds|kBackpropInPlace);
 190   }
 191   NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
 192   virtual std::string Type() const { return "NormalizeComponent"; }
 193   virtual void InitFromConfig(ConfigLine *cfl);
 194   virtual Component* Copy() const { return new NormalizeComponent(*this); }
 195   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 196                          const CuMatrixBase<BaseFloat> &in,
 197                          CuMatrixBase<BaseFloat> *out) const;
 198   virtual void Backprop(const std::string &debug_info,
 199                         const ComponentPrecomputedIndexes *indexes,
 200                         const CuMatrixBase<BaseFloat> &in_value,
 201                         const CuMatrixBase<BaseFloat> &, // out_value
 202                         const CuMatrixBase<BaseFloat> &out_deriv,
 203                         Component *to_update,
 204                         CuMatrixBase<BaseFloat> *in_deriv) const;
 205
 206   virtual void Read(std::istream &is, bool binary);
 207   virtual void Write(std::ostream &os, bool binary) const;
 208   virtual int32 InputDim() const { return input_dim_; }
 209   virtual int32 OutputDim() const {
 210     return (input_dim_ + (add_log_stddev_ ? 1 : 0));
 211   }
 212   virtual std::string Info() const;
 213  private:
 214   NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
 215   enum { kExpSquaredNormFloor = -66 };
 216   static const BaseFloat kSquaredNormFloor;
 217   int32 input_dim_;
 218   BaseFloat target_rms_; // The target rms for outputs.
 219   // about 0.7e-20.  We need a value that's exactly representable in
 220   // float and whose inverse square root is also exactly representable
 221   // in float (hence, an even power of two).
 222
 223   bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
 224                         // is an extra dimension of the output.
 225 };
 226
 227
 228 class SigmoidComponent: public NonlinearComponent {
 229  public:
 230   explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
 231   SigmoidComponent() { }
 232   virtual std::string Type() const { return "SigmoidComponent"; }
 233   virtual int32 Properties() const {
 234     return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
 235   }
 236   virtual Component* Copy() const { return new SigmoidComponent(*this); }
 237   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 238                          const CuMatrixBase<BaseFloat> &in,
 239                          CuMatrixBase<BaseFloat> *out) const;
 240   virtual void Backprop(const std::string &debug_info,
 241                         const ComponentPrecomputedIndexes *indexes,
 242                         const CuMatrixBase<BaseFloat> &, //in_value
 243                         const CuMatrixBase<BaseFloat> &out_value,
 244                         const CuMatrixBase<BaseFloat> &out_deriv,
 245                         Component *to_update,
 246                         CuMatrixBase<BaseFloat> *in_deriv) const;
 247   virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 248  private:
 249   // this function is called from Backprop code and only does something if the
 250   // self-repair-scale config value is set.
 251   void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
 252                        CuMatrixBase<BaseFloat> *in_deriv,
 253                        SigmoidComponent *to_update) const;
 254
 255   SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
 256 };
 257
 258 class TanhComponent: public NonlinearComponent {
 259  public:
 260   explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
 261   TanhComponent() { }
 262   virtual std::string Type() const { return "TanhComponent"; }
 263   virtual Component* Copy() const { return new TanhComponent(*this); }
 264   virtual int32 Properties() const {
 265     return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
 266   }
 267   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 268                          const CuMatrixBase<BaseFloat> &in,
 269                          CuMatrixBase<BaseFloat> *out) const;
 270   virtual void Backprop(const std::string &debug_info,
 271                         const ComponentPrecomputedIndexes *indexes,
 272                         const CuMatrixBase<BaseFloat> &, //in_value
 273                         const CuMatrixBase<BaseFloat> &out_value,
 274                         const CuMatrixBase<BaseFloat> &out_deriv,
 275                         Component *to_update,
 276                         CuMatrixBase<BaseFloat> *in_deriv) const;
 277   virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 278  private:
 279   // this function is called from Backprop code and only does something if the
 280   // self-repair-scale config value is set.
 281   void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
 282                        CuMatrixBase<BaseFloat> *in_deriv,
 283                        TanhComponent *to_update) const;
 284
 285   TanhComponent &operator = (const TanhComponent &other); // Disallow.
 286 };
 287
 288
 289 class RectifiedLinearComponent: public NonlinearComponent {
 290  public:
 291   explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
 292       NonlinearComponent(other) { }
 293   RectifiedLinearComponent() { }
 294   virtual std::string Type() const { return "RectifiedLinearComponent"; }
 295   virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
 296   virtual int32 Properties() const {
 297     return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
 298         kStoresStats;
 299   }
 300   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 301                          const CuMatrixBase<BaseFloat> &in,
 302                          CuMatrixBase<BaseFloat> *out) const;
 303   virtual void Backprop(const std::string &debug_info,
 304                         const ComponentPrecomputedIndexes *indexes,
 305                         const CuMatrixBase<BaseFloat> &, //in_value
 306                         const CuMatrixBase<BaseFloat> &out_value,
 307                         const CuMatrixBase<BaseFloat> &out_deriv,
 308                         Component *to_update,
 309                         CuMatrixBase<BaseFloat> *in_deriv) const;
 310   virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 311
 312  private:
 313   // this function is called from Backprop code and only does something if the
 314   // self-repair-scale config value is set.
 315   void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
 316                        RectifiedLinearComponent *to_update) const;
 317
 318   RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
 319 };
 320
 321 /**
 322    This component is a fixed (non-trainable) nonlinearity that sums its inputs
 323    to produce outputs.  Currently the only supported configuration is that its
 324    input-dim is interpreted as consisting of n blocks, and the output is just a
 325    summation over the n blocks, where  n = input-dim / output-dim, so for instance
 326     output[n] = input[n] + input[block-size + n] + .... .
 327    Later if needed we can add a configuration variable that allows you to sum
 328    over 'interleaved' input.
 329  */
 330 class SumReduceComponent: public Component {
 331  public:
 332   void Init(int32 input_dim, int32 output_dim);
 333   explicit SumReduceComponent(int32 input_dim, int32 output_dim) {
 334     Init(input_dim, output_dim);
 335   }
 336   virtual int32 Properties() const {
 337     return kSimpleComponent|kLinearInInput;
 338   }
 339   SumReduceComponent(): input_dim_(0), output_dim_(0) { }
 340   virtual std::string Type() const { return "SumReduceComponent"; }
 341   virtual void InitFromConfig(ConfigLine *cfl);
 342   virtual int32 InputDim() const { return input_dim_; }
 343   virtual int32 OutputDim() const { return output_dim_; }
 344   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 345                          const CuMatrixBase<BaseFloat> &in,
 346                          CuMatrixBase<BaseFloat> *out) const;
 347   virtual void Backprop(const std::string &debug_info,
 348                         const ComponentPrecomputedIndexes *indexes,
 349                         const CuMatrixBase<BaseFloat> &, // in_value
 350                         const CuMatrixBase<BaseFloat> &, // out_value,
 351                         const CuMatrixBase<BaseFloat> &out_deriv,
 352                         Component *, // to_update
 353                         CuMatrixBase<BaseFloat> *in_deriv) const;
 354   virtual Component* Copy() const { return new SumReduceComponent(input_dim_,
 355                                                                   output_dim_); }
 356
 357   virtual void Read(std::istream &is, bool binary); // This Read function
 358   // requires that the Component has the correct type.
 359
 360   /// Write component to stream
 361   virtual void Write(std::ostream &os, bool binary) const;
 362
 363  protected:
 364   int32 input_dim_;
 365   int32 output_dim_;
 366 };
 367
 368
 369 class FixedAffineComponent;
 370 class FixedScaleComponent;
 371 class PerElementScaleComponent;
 372 class PerElementOffsetComponent;
 373
 374 // Affine means a linear function plus an offset.
 375 // Note: although this class can be instantiated, it also
 376 // functions as a base-class for more specialized versions of
 377 // AffineComponent.
 378 class AffineComponent: public UpdatableComponent {
 379   friend class SoftmaxComponent; // Friend declaration relates to mixing up.
 380  public:
 381
 382   virtual int32 InputDim() const { return linear_params_.NumCols(); }
 383   virtual int32 OutputDim() const { return linear_params_.NumRows(); }
 384
 385   virtual std::string Info() const;
 386   virtual void InitFromConfig(ConfigLine *cfl);
 387
 388   AffineComponent() { } // use Init to really initialize.
 389   virtual std::string Type() const { return "AffineComponent"; }
 390   virtual int32 Properties() const {
 391     return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
 392         kBackpropNeedsInput|kBackpropAdds;
 393   }
 394
 395
 396   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 397                          const CuMatrixBase<BaseFloat> &in,
 398                          CuMatrixBase<BaseFloat> *out) const;
 399   virtual void Backprop(const std::string &debug_info,
 400                         const ComponentPrecomputedIndexes *indexes,
 401                         const CuMatrixBase<BaseFloat> &in_value,
 402                         const CuMatrixBase<BaseFloat> &, // out_value
 403                         const CuMatrixBase<BaseFloat> &out_deriv,
 404                         Component *to_update,
 405                         CuMatrixBase<BaseFloat> *in_deriv) const;
 406
 407   virtual void Read(std::istream &is, bool binary);
 408   virtual void Write(std::ostream &os, bool binary) const;
 409
 410   virtual Component* Copy() const;
 411
 412
 413   // Some functions from base-class UpdatableComponent.
 414   virtual void Scale(BaseFloat scale);
 415   virtual void Add(BaseFloat alpha, const Component &other);
 416   virtual void PerturbParams(BaseFloat stddev);
 417   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
 418   virtual int32 NumParameters() const;
 419   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
 420   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 421
 422   // Some functions that are specific to this class.
 423
 424   // This new function is used when mixing up:
 425   virtual void SetParams(const VectorBase<BaseFloat> &bias,
 426                          const MatrixBase<BaseFloat> &linear);
 427   const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
 428   const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 429   explicit AffineComponent(const AffineComponent &other);
 430   // The next constructor is used in converting from nnet1.
 431   AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
 432                   const CuVectorBase<BaseFloat> &bias_params,
 433                   BaseFloat learning_rate);
 434   void Init(int32 input_dim, int32 output_dim,
 435             BaseFloat param_stddev, BaseFloat bias_stddev);
 436   void Init(std::string matrix_filename);
 437
 438   // This function resizes the dimensions of the component, setting the
 439   // parameters to zero, while leaving any other configuration values the same.
 440   virtual void Resize(int32 input_dim, int32 output_dim);
 441
 442   // The following functions are used for collapsing multiple layers
 443   // together.  They return a pointer to a new Component equivalent to
 444   // the sequence of two components.  We haven't implemented this for
 445   // FixedLinearComponent yet.
 446   Component *CollapseWithNext(const AffineComponent &next) const ;
 447   Component *CollapseWithNext(const FixedAffineComponent &next) const;
 448   Component *CollapseWithNext(const FixedScaleComponent &next) const;
 449   Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
 450
 451  protected:
 452   friend class NaturalGradientAffineComponent;
 453   // This function Update() is for extensibility; child classes may override
 454   // this, e.g. for natural gradient update.
 455   virtual void Update(
 456       const std::string &debug_info,
 457       const CuMatrixBase<BaseFloat> &in_value,
 458       const CuMatrixBase<BaseFloat> &out_deriv) {
 459     UpdateSimple(in_value, out_deriv);
 460   }
 461   // UpdateSimple is used when *this is a gradient.  Child classes may override
 462   // this if needed, but typically won't need to.
 463   virtual void UpdateSimple(
 464       const CuMatrixBase<BaseFloat> &in_value,
 465       const CuMatrixBase<BaseFloat> &out_deriv);
 466
 467   const AffineComponent &operator = (const AffineComponent &other); // Disallow.
 468   CuMatrix<BaseFloat> linear_params_;
 469   CuVector<BaseFloat> bias_params_;
 470 };
 471
 472 class RepeatedAffineComponent;
 473
 474 /// This class implements an affine transform using a block diagonal matrix
 475 /// e.g., one whose weight matrix is all zeros except for blocks on the
 476 /// diagonal. All these blocks have the same dimensions.
 477 ///  input-dim: num cols of block diagonal matrix.
 478 ///  output-dim: num rows of block diagonal matrix.
 479 /// num-blocks: number of blocks in diagonal of the matrix.
 480 /// num-blocks must divide both input-dim and output-dim
 481 class BlockAffineComponent : public UpdatableComponent {
 482  public:
 483   virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
 484   virtual int32 OutputDim() const { return linear_params_.NumRows(); }
 485
 486   virtual std::string Info() const;
 487   virtual void InitFromConfig(ConfigLine *cfl);
 488
 489   BlockAffineComponent() { }
 490   virtual std::string Type() const { return "BlockAffineComponent"; }
 491   virtual int32 Properties() const {
 492     return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
 493       kBackpropNeedsInput|kBackpropAdds;
 494   }
 495
 496   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 497                          const CuMatrixBase<BaseFloat> &in,
 498                          CuMatrixBase<BaseFloat> *out) const;
 499
 500   virtual void Backprop(const std::string &debug_info,
 501                         const ComponentPrecomputedIndexes *indexes,
 502                         const CuMatrixBase<BaseFloat> &in_value,
 503                         const CuMatrixBase<BaseFloat> &, // out_value
 504                         const CuMatrixBase<BaseFloat> &out_deriv,
 505                         Component *to_update,
 506                         CuMatrixBase<BaseFloat> *in_deriv) const;
 507
 508   virtual void Read(std::istream &is, bool binary);
 509   virtual void Write(std::ostream &os, bool binary) const;
 510
 511   virtual Component* Copy() const;
 512
 513   // Functions from base-class UpdatableComponent.
 514   virtual void Scale(BaseFloat scale);
 515   virtual void Add(BaseFloat alpha, const Component &other);
 516   virtual void PerturbParams(BaseFloat stddev);
 517   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
 518   virtual int32 NumParameters() const;
 519   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
 520   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 521
 522   // BlockAffine-specific functions.
 523   void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
 524             BaseFloat param_stddev, BaseFloat bias_mean,
 525             BaseFloat bias_stddev);
 526   explicit BlockAffineComponent(const BlockAffineComponent &other);
 527   explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
 528  protected:
 529   // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
 530   // equal size.  The blocks are stored in linear_params_ as
 531   // [ M
 532   //   N
 533   //   O ] but we actually treat it as the matrix:
 534   // [ M 0 0
 535   //   0 N 0
 536   //   0 0 O ]
 537   CuMatrix<BaseFloat> linear_params_;
 538   CuVector<BaseFloat> bias_params_;
 539   int32 num_blocks_;
 540  private:
 541   const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
 542 };
 543
 544 class RepeatedAffineComponent: public UpdatableComponent {
 545  public:
 546
 547   virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
 548   virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
 549
 550   virtual std::string Info() const;
 551   virtual void InitFromConfig(ConfigLine *cfl);
 552
 553   RepeatedAffineComponent() { } // use Init to really initialize.
 554   virtual std::string Type() const { return "RepeatedAffineComponent"; }
 555   virtual int32 Properties() const {
 556     return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
 557         kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
 558   }
 559   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 560                          const CuMatrixBase<BaseFloat> &in,
 561                          CuMatrixBase<BaseFloat> *out) const;
 562   virtual void Backprop(const std::string &debug_info,
 563                         const ComponentPrecomputedIndexes *indexes,
 564                         const CuMatrixBase<BaseFloat> &in_value,
 565                         const CuMatrixBase<BaseFloat> &, // out_value
 566                         const CuMatrixBase<BaseFloat> &out_deriv,
 567                         Component *to_update,
 568                         CuMatrixBase<BaseFloat> *in_deriv) const;
 569
 570   virtual void Read(std::istream &is, bool binary);
 571   virtual void Write(std::ostream &os, bool binary) const;
 572
 573   virtual Component* Copy() const;
 574
 575   // Some functions from base-class UpdatableComponent.
 576   virtual void Scale(BaseFloat scale);
 577   virtual void Add(BaseFloat alpha, const Component &other);
 578   virtual void PerturbParams(BaseFloat stddev);
 579   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
 580   virtual int32 NumParameters() const;
 581   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
 582   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 583
 584   // Some functions that are specific to this class.
 585   const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
 586   const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 587   explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
 588
 589   void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
 590             BaseFloat param_stddev, BaseFloat bias_mean,
 591             BaseFloat bias_stddev);
 592   friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
 593  protected:
 594   // This function Update(), called from backprop, is broken out for
 595   // extensibility to natural gradient update.
 596   virtual void Update(
 597       const CuMatrixBase<BaseFloat> &in_value,
 598       const CuMatrixBase<BaseFloat> &out_deriv);
 599
 600   // This function does nothing here but is redefined in child-class
 601   // NaturalGradientRepeatedAffineComponent.  This help avoid repeated code.
 602   virtual void SetNaturalGradientConfigs() { }
 603
 604   const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
 605   CuMatrix<BaseFloat> linear_params_;
 606   CuVector<BaseFloat> bias_params_;
 607   int32 num_repeats_;
 608 };
 609
 610 class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
 611  public:
 612   // Use Init() to really initialize.
 613   NaturalGradientRepeatedAffineComponent() { }
 614
 615   // Most of the public functions are inherited from RepeatedAffineComponent.
 616   virtual std::string Type() const {
 617     return "NaturalGradientRepeatedAffineComponent";
 618   }
 619
 620   virtual Component* Copy() const;
 621
 622   // Copy constructor
 623   explicit NaturalGradientRepeatedAffineComponent(
 624       const NaturalGradientRepeatedAffineComponent &other);
 625  private:
 626   virtual void Update(
 627       const CuMatrixBase<BaseFloat> &in_value,
 628       const CuMatrixBase<BaseFloat> &out_deriv);
 629
 630   const NaturalGradientRepeatedAffineComponent &operator=(
 631       const NaturalGradientRepeatedAffineComponent &other); // Disallow.
 632
 633   // Applies the default configuration to preconditioner_in_.
 634   virtual void SetNaturalGradientConfigs();
 635
 636   // For efficiency reasons we only apply the natural gradient to the input
 637   // side, i.e. not to the space of output derivatives-- we believe the input
 638   // side is the more important side.  We don't make the natural-gradient
 639   // configurable; we just give it a reasonable configuration.
 640   // Instead of using the individual data-points, for efficiency reasons we use
 641   // the distribution of per-minibatch summed derivatives over each dimension of
 642   // the output space, as the source for the Fisher matrix.
 643   OnlineNaturalGradient preconditioner_in_;
 644 };
 645
 646 class SoftmaxComponent: public NonlinearComponent {
 647  public:
 648   explicit SoftmaxComponent(const SoftmaxComponent &other):
 649       NonlinearComponent(other) { }
 650   SoftmaxComponent() { }
 651   virtual std::string Type() const { return "SoftmaxComponent"; }
 652   virtual int32 Properties() const {
 653     return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
 654   }
 655   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 656                          const CuMatrixBase<BaseFloat> &in,
 657                          CuMatrixBase<BaseFloat> *out) const;
 658   virtual void Backprop(const std::string &debug_info,
 659                         const ComponentPrecomputedIndexes *indexes,
 660                         const CuMatrixBase<BaseFloat> &in_value,
 661                         const CuMatrixBase<BaseFloat> &out_value,
 662                         const CuMatrixBase<BaseFloat> &out_deriv,
 663                         Component *to_update,
 664                         CuMatrixBase<BaseFloat> *in_deriv) const;
 665   virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 666
 667   virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 668  private:
 669   SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
 670 };
 671
 672 class LogSoftmaxComponent: public NonlinearComponent {
 673  public:
 674   explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
 675       NonlinearComponent(other) { }
 676   LogSoftmaxComponent() { }
 677   virtual std::string Type() const { return "LogSoftmaxComponent"; }
 678   virtual int32 Properties() const {
 679     return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
 680   }
 681   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 682                          const CuMatrixBase<BaseFloat> &in,
 683                          CuMatrixBase<BaseFloat> *out) const;
 684   virtual void Backprop(const std::string &debug_info,
 685                         const ComponentPrecomputedIndexes *indexes,
 686                         const CuMatrixBase<BaseFloat> &in_value,
 687                         const CuMatrixBase<BaseFloat> &out_value,
 688                         const CuMatrixBase<BaseFloat> &out_deriv,
 689                         Component *to_update,
 690                         CuMatrixBase<BaseFloat> *in_deriv) const;
 691
 692   virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
 693  private:
 694   LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
 695 };
 696
 697 /// Keywords: natural gradient descent, NG-SGD, naturalgradient.  For
 698 /// the top-level of the natural gradient code look here, and also in
 699 /// nnet-precondition-online.h.
 700 /// NaturalGradientAffineComponent is
 701 /// a version of AffineComponent that has a non-(multiple of unit) learning-rate
 702 /// matrix.  See nnet-precondition-online.h for a description of the technique.
 703 /// It is described, under the name Online NG-SGD, in the paper "Parallel
 704 /// training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
 705 /// workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
 706 class NaturalGradientAffineComponent: public AffineComponent {
 707  public:
 708   virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
 709   virtual void Read(std::istream &is, bool binary);
 710   virtual void Write(std::ostream &os, bool binary) const;
 711   void Init(int32 input_dim, int32 output_dim,
 712             BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
 713             int32 rank_in, int32 rank_out, int32 update_period,
 714             BaseFloat num_samples_history, BaseFloat alpha,
 715             BaseFloat max_change_per_sample);
 716   void Init(int32 rank_in, int32 rank_out, int32 update_period,
 717             BaseFloat num_samples_history,
 718             BaseFloat alpha, BaseFloat max_change_per_sample,
 719             std::string matrix_filename);
 720   // this constructor does not really initialize, use Init() or Read().
 721   NaturalGradientAffineComponent();
 722   virtual void Resize(int32 input_dim, int32 output_dim);
 723   virtual void InitFromConfig(ConfigLine *cfl);
 724   virtual std::string Info() const;
 725   virtual Component* Copy() const;
 726   virtual void Scale(BaseFloat scale);
 727   virtual void Add(BaseFloat alpha, const Component &other);
 728   // copy constructor
 729   explicit NaturalGradientAffineComponent(
 730       const NaturalGradientAffineComponent &other);
 731   virtual void ZeroStats();
 732
 733  private:
 734   // disallow assignment operator.
 735   NaturalGradientAffineComponent &operator= (
 736       const NaturalGradientAffineComponent&);
 737
 738   // Configs for preconditioner.  The input side tends to be better conditioned ->
 739   // smaller rank needed, so make them separately configurable.
 740   int32 rank_in_;
 741   int32 rank_out_;
 742   int32 update_period_;
 743   BaseFloat num_samples_history_;
 744   BaseFloat alpha_;
 745
 746   OnlineNaturalGradient preconditioner_in_;
 747
 748   OnlineNaturalGradient preconditioner_out_;
 749
 750   // If > 0, max_change_per_sample_ is the maximum amount of parameter
 751   // change (in L2 norm) that we allow per sample, averaged over the minibatch.
 752   // This was introduced in order to control instability.
 753   // Instead of the exact L2 parameter change, for
 754   // efficiency purposes we limit a bound on the exact
 755   // change.  The limit is applied via a constant <= 1.0
 756   // for each minibatch, A suitable value might be, for
 757   // example, 10 or so; larger if there are more
 758   // parameters.
 759   BaseFloat max_change_per_sample_;
 760
 761   // update_count_ records how many updates we have done.
 762   double update_count_;
 763
 764   // active_scaling_count_ records how many updates we have done,
 765   // where the scaling factor is active (not 1.0).
 766   double active_scaling_count_;
 767
 768   // max_change_scale_stats_ records the sum of scaling factors
 769   // in each update, so we can compute the averaged scaling factor
 770   // in Info().
 771   double max_change_scale_stats_;
 772
 773   // Sets the configs rank, alpha and eta in the preconditioner objects,
 774   // from the class variables.
 775   void SetNaturalGradientConfigs();
 776
 777   virtual void Update(
 778       const std::string &debug_info,
 779       const CuMatrixBase<BaseFloat> &in_value,
 780       const CuMatrixBase<BaseFloat> &out_deriv);
 781 };
 782
 783
 784 /// FixedAffineComponent is an affine transform that is supplied
 785 /// at network initialization time and is not trainable.
 786 class FixedAffineComponent: public Component {
 787  public:
 788   FixedAffineComponent() { }
 789   virtual std::string Type() const { return "FixedAffineComponent"; }
 790   virtual std::string Info() const;
 791
 792   // Copy constructor from AffineComponent-- can be used when we're done
 793   // training a particular part of the model and want to efficiently disable
 794   // further training.
 795   FixedAffineComponent(const AffineComponent &c);
 796
 797   /// matrix should be of size input-dim+1 to output-dim, last col is offset
 798   void Init(const CuMatrixBase<BaseFloat> &matrix);
 799
 800   // The ConfigLine cfl contains just the option matrix=<string>,
 801   // where the string is the filename of a Kaldi-format matrix to read.
 802   virtual void InitFromConfig(ConfigLine *cfl);
 803
 804   virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
 805   virtual int32 InputDim() const { return linear_params_.NumCols(); }
 806   virtual int32 OutputDim() const { return linear_params_.NumRows(); }
 807
 808   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 809                          const CuMatrixBase<BaseFloat> &in,
 810                          CuMatrixBase<BaseFloat> *out) const;
 811   virtual void Backprop(const std::string &debug_info,
 812                         const ComponentPrecomputedIndexes *indexes,
 813                         const CuMatrixBase<BaseFloat> &in_value,
 814                         const CuMatrixBase<BaseFloat> &, // out_value
 815                         const CuMatrixBase<BaseFloat> &out_deriv,
 816                         Component *to_update,
 817                         CuMatrixBase<BaseFloat> *in_deriv) const;
 818
 819
 820   virtual Component* Copy() const;
 821   virtual void Read(std::istream &is, bool binary);
 822   virtual void Write(std::ostream &os, bool binary) const;
 823
 824   // Function to provide access to linear_params_.
 825   const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 826  protected:
 827   friend class AffineComponent;
 828   CuMatrix<BaseFloat> linear_params_;
 829   CuVector<BaseFloat> bias_params_;
 830
 831   KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
 832 };
 833
 834 /// SumGroupComponent is used to sum up groups of posteriors.
 835 /// It's used to introduce a kind of Gaussian-mixture-model-like
 836 /// idea into neural nets.  This is basically a degenerate case of
 837 /// MixtureProbComponent; we had to implement it separately to
 838 /// be efficient for CUDA (we can use this one regardless whether
 839 /// we have CUDA or not; it's the normal case we want anyway).
 840 ///
 841 /// There are two forms of initialization in a config file: one
 842 /// where the number of elements are specified for each group
 843 /// individually as a vector, and one where only the total input
 844 /// dimension and the output dimension (number of groups) is specified.
 845 /// The second is used when all groups have the same size.
 846 class SumGroupComponent: public Component {
 847 public:
 848   virtual int32 InputDim() const { return input_dim_; }
 849   virtual int32 OutputDim() const { return output_dim_; }
 850   void Init(const std::vector<int32> &sizes); // the vector is of the input dim
 851                                               // (>= 1) for each output dim.
 852   void Init(int32 input_dim, int32 output_dim);
 853   void GetSizes(std::vector<int32> *sizes) const; // Get a vector saying, for
 854                                                   // each output-dim, how many
 855                                                   // inputs were summed over.
 856   virtual void InitFromConfig(ConfigLine *cfl);
 857   SumGroupComponent() { }
 858   virtual std::string Type() const { return "SumGroupComponent"; }
 859   virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
 860   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 861                          const CuMatrixBase<BaseFloat> &in,
 862                          CuMatrixBase<BaseFloat> *out) const;
 863   virtual void Backprop(const std::string &debug_info,
 864                         const ComponentPrecomputedIndexes *indexes,
 865                         const CuMatrixBase<BaseFloat> &in_value,
 866                         const CuMatrixBase<BaseFloat> &, // out_value
 867                         const CuMatrixBase<BaseFloat> &out_deriv,
 868                         Component *to_update,
 869                         CuMatrixBase<BaseFloat> *in_deriv) const;
 870   virtual Component* Copy() const;
 871   virtual void Read(std::istream &is, bool binary);
 872   virtual void Write(std::ostream &os, bool binary) const;
 873
 874 private:
 875   KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
 876   // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
 877   // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
 878   CuArray<Int32Pair> indexes_; // for each output index, the (start, end) input
 879                                // index.
 880   CuArray<int32> reverse_indexes_; // for each input index, the output index.
 881   int32 input_dim_;
 882   int32 output_dim_;
 883 };
 884
 885
 886 /// FixedScaleComponent applies a fixed per-element scale; it's similar
 887 /// to the Rescale component in the nnet1 setup (and only needed for nnet1
 888 /// model conversion).
 889 class FixedScaleComponent: public Component {
 890  public:
 891   FixedScaleComponent() { }
 892   virtual std::string Type() const { return "FixedScaleComponent"; }
 893   virtual std::string Info() const;
 894   virtual int32 Properties() const {
 895     return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
 896   }
 897
 898   void Init(const CuVectorBase<BaseFloat> &scales);
 899
 900   // The ConfigLine cfl contains only the option scales=<string>,
 901   // where the string is the filename of a Kaldi-format matrix to read.
 902   virtual void InitFromConfig(ConfigLine *cfl);
 903
 904   virtual int32 InputDim() const { return scales_.Dim(); }
 905   virtual int32 OutputDim() const { return scales_.Dim(); }
 906
 907   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 908                          const CuMatrixBase<BaseFloat> &in,
 909                          CuMatrixBase<BaseFloat> *out) const;
 910   virtual void Backprop(const std::string &debug_info,
 911                         const ComponentPrecomputedIndexes *indexes,
 912                         const CuMatrixBase<BaseFloat> &, // in_value
 913                         const CuMatrixBase<BaseFloat> &, // out_value
 914                         const CuMatrixBase<BaseFloat> &out_deriv,
 915                         Component *, // to_update
 916                         CuMatrixBase<BaseFloat> *in_deriv) const;
 917   virtual Component* Copy() const;
 918   virtual void Read(std::istream &is, bool binary);
 919   virtual void Write(std::ostream &os, bool binary) const;
 920
 921  protected:
 922   friend class AffineComponent;  // necessary for collapse
 923   CuVector<BaseFloat> scales_;
 924   KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
 925 };
 926
 927
 928 /// FixedBiasComponent applies a fixed per-element bias; it's similar
 929 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
 930 /// model conversion.
 931 class FixedBiasComponent: public Component {
 932  public:
 933   FixedBiasComponent() { }
 934   virtual std::string Type() const { return "FixedBiasComponent"; }
 935   virtual std::string Info() const;
 936
 937   virtual int32 Properties() const {
 938     return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
 939   }
 940
 941   void Init(const CuVectorBase<BaseFloat> &scales);
 942
 943   // The ConfigLine cfl contains only the option bias=<string>,
 944   // where the string is the filename of a Kaldi-format matrix to read.
 945   virtual void InitFromConfig(ConfigLine *cfl);
 946   virtual int32 InputDim() const { return bias_.Dim(); }
 947   virtual int32 OutputDim() const { return bias_.Dim(); }
 948   using Component::Propagate; // to avoid name hiding
 949   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 950                          const CuMatrixBase<BaseFloat> &in,
 951                          CuMatrixBase<BaseFloat> *out) const;
 952   virtual void Backprop(const std::string &debug_info,
 953                         const ComponentPrecomputedIndexes *indexes,
 954                         const CuMatrixBase<BaseFloat> &, // in_value,
 955                         const CuMatrixBase<BaseFloat> &, // out_value
 956                         const CuMatrixBase<BaseFloat> &out_deriv,
 957                         Component *, // to_update
 958                         CuMatrixBase<BaseFloat> *in_deriv) const;
 959   virtual Component* Copy() const;
 960   virtual void Read(std::istream &is, bool binary);
 961   virtual void Write(std::ostream &os, bool binary) const;
 962
 963  protected:
 964   CuVector<BaseFloat> bias_;
 965   KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
 966 };
 967
 968 // NoOpComponent just duplicates its input.  We don't anticipate this being used
 969 // very often, but it may sometimes make your life easier
 970 class NoOpComponent: public NonlinearComponent {
 971  public:
 972   explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
 973   NoOpComponent() { }
 974   virtual std::string Type() const { return "NoOpComponent"; }
 975   virtual int32 Properties() const {
 976     return kSimpleComponent|kLinearInInput|kPropagateInPlace;
 977   }
 978   virtual Component* Copy() const { return new NoOpComponent(*this); }
 979   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
 980                          const CuMatrixBase<BaseFloat> &in,
 981                          CuMatrixBase<BaseFloat> *out) const;
 982   virtual void Backprop(const std::string &debug_info,
 983                         const ComponentPrecomputedIndexes *indexes,
 984                         const CuMatrixBase<BaseFloat> &, //in_value
 985                         const CuMatrixBase<BaseFloat> &, // out_value,
 986                         const CuMatrixBase<BaseFloat> &out_deriv,
 987                         Component *to_update,
 988                         CuMatrixBase<BaseFloat> *in_deriv) const;
 989  private:
 990   NoOpComponent &operator = (const NoOpComponent &other); // Disallow.
 991 };
 992
 993 // ClipGradientComponent just duplicates its input, but clips gradients
 994 // during backpropagation if they cross a predetermined threshold.
 995 // This component will be used to prevent gradient explosion problem in
 996 // recurrent neural networks
 997 class ClipGradientComponent: public Component {
 998  public:
 999   ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
1000                         bool norm_based_clipping,
1001                         BaseFloat self_repair_clipped_proportion_threshold,
1002                         BaseFloat self_repair_target,
1003                         BaseFloat self_repair_scale,
1004                         int32 num_clipped,
1005                         int32 count,
1006                         int32 num_self_repaired,
1007                         int32 num_backpropped) {
1008     Init(dim, clipping_threshold, norm_based_clipping,
1009          self_repair_clipped_proportion_threshold,
1010          self_repair_target,
1011          self_repair_scale,
1012          num_clipped, count,
1013          num_self_repaired, num_backpropped);}
1014
1015   ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
1016     norm_based_clipping_(false),
1017     self_repair_clipped_proportion_threshold_(1.0),
1018     self_repair_target_(0.0),
1019     self_repair_scale_(0.0),
1020     num_clipped_(0), count_(0),
1021     num_self_repaired_(0), num_backpropped_(0) { }
1022
1023   virtual int32 InputDim() const { return dim_; }
1024   virtual int32 OutputDim() const { return dim_; }
1025   virtual void InitFromConfig(ConfigLine *cfl);
1026   void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
1027             BaseFloat self_repair_clipped_proportion_threshold,
1028             BaseFloat self_repair_target,
1029             BaseFloat self_repair_scale,
1030             int32 num_clipped, int32 count,
1031             int32 num_self_repaired, int32 num_backpropped);
1032
1033   virtual std::string Type() const { return "ClipGradientComponent"; }
1034
1035   virtual int32 Properties() const {
1036     return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
1037            kBackpropNeedsInput;
1038   }
1039
1040   virtual void ZeroStats();
1041
1042   virtual Component* Copy() const {
1043     return new ClipGradientComponent(dim_,
1044                                      clipping_threshold_,
1045                                      norm_based_clipping_,
1046                                      self_repair_clipped_proportion_threshold_,
1047                                      self_repair_target_,
1048                                      self_repair_scale_,
1049                                      num_clipped_,
1050                                      count_,
1051                                      num_self_repaired_,
1052                                      num_backpropped_);}
1053
1054   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1055                          const CuMatrixBase<BaseFloat> &in,
1056                          CuMatrixBase<BaseFloat> *out) const;
1057   virtual void Backprop(const std::string &debug_info,
1058                         const ComponentPrecomputedIndexes *indexes,
1059                         const CuMatrixBase<BaseFloat> &in_value,
1060                         const CuMatrixBase<BaseFloat> &, // out_value,
1061                         const CuMatrixBase<BaseFloat> &out_deriv,
1062                         Component *to_update,
1063                         CuMatrixBase<BaseFloat> *in_deriv) const;
1064
1065   virtual void Scale(BaseFloat scale);
1066   virtual void Add(BaseFloat alpha, const Component &other);
1067   virtual void Read(std::istream &is, bool binary); // This Read function
1068   // requires that the Component has the correct type.
1069   /// Write component to stream
1070   virtual void Write(std::ostream &os, bool binary) const;
1071   virtual std::string Info() const;
1072   virtual ~ClipGradientComponent() {
1073     if (num_self_repaired_ > 0)
1074       KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
1075                 << ")'s self-repair was activated " << num_self_repaired_
1076                 << " time(s) out of " << num_backpropped_
1077                 << " times of calling Backprop() in this training job.";
1078   }
1079  private:
1080   int32 dim_;  // input/output dimension
1081   BaseFloat clipping_threshold_;  // threshold to be used for clipping
1082                                   // could correspond to max-row-norm (if
1083                                   // norm_based_clipping_ == true) or
1084                                   // max-absolute-value (otherwise)
1085   bool norm_based_clipping_;  // if true the max-row-norm will be clipped
1086                               // else element-wise absolute value clipping is
1087                               // done
1088
1089   // some configuration values relating to self-repairing.
1090   BaseFloat self_repair_clipped_proportion_threshold_; // the threshold of
1091                                                        // clipped-proportion
1092                                                        // for self-repair to be
1093                                                        // activated
1094   BaseFloat self_repair_target_; // the target value towards which self-repair
1095                                  // is trying to set for in-deriv
1096   BaseFloat self_repair_scale_;  // constant scaling the self-repair vector
1097   std::string debug_info_;   // component-node name, used in the destructor to
1098                              // print out stats of self-repair
1099
1100   // this function is called from Backprop code, and only does something if the
1101   // self-repair-scale config value is set and the current clipped proportion
1102   // exceeds the threshold. What it does is to add a term to in-deriv that
1103   // forces the input to the ClipGradientComponent to be close to some small
1104   // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
1105   // Sigmoid or Tanh or Affine). The hope is that if the input is forced to be
1106   // small, the parameters on the path will also tend to be small, which may
1107   // help tamp down the divergence caused by gradient explosion.
1108   void RepairGradients(const std::string &debug_info,
1109                        const CuMatrixBase<BaseFloat> &in_value,
1110                        CuMatrixBase<BaseFloat> *in_deriv,
1111                        ClipGradientComponent *to_update) const;
1112
1113   ClipGradientComponent &operator =
1114       (const ClipGradientComponent &other); // Disallow.
1115
1116  protected:
1117   // variables to store stats
1118   // An element corresponds to rows of derivative matrix, when
1119   // norm_based_clipping_ is true,
1120   // else it corresponds to each element of the derivative matrix
1121   // Note: no stats are stored when norm_based_clipping_ is false
1122   int32 num_clipped_;  // number of elements which were clipped
1123   int32 count_;  // number of elements which were processed
1124   int32 num_self_repaired_; // number of times self-repair is activated
1125   int32 num_backpropped_; //number of times backprop is called
1126
1127 };
1128
1129 /** PermuteComponent changes the order of the columns (i.e. the feature or
1130     activation dimensions).  Output dimension i is mapped to input dimension
1131     column_map_[i], so it's like doing:
1132       for each row:
1133         for each feature/activation dimension i:
1134           output(row, i) = input(row, column_map_[i]).
1135
1136 */
1137 class PermuteComponent: public Component {
1138  public:
1139   PermuteComponent()  {}
1140   PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }
1141
1142   virtual int32 InputDim() const { return column_map_.Dim(); }
1143   virtual int32 OutputDim() const { return column_map_.Dim(); }
1144   virtual void InitFromConfig(ConfigLine *cfl);
1145   void Init(const std::vector<int32> &column_map);
1146
1147   virtual std::string Type() const { return "PermuteComponent"; }
1148
1149   virtual int32 Properties() const {
1150     return kSimpleComponent|kLinearInInput;
1151   }
1152
1153   virtual void ZeroStats() {}
1154
1155   virtual Component* Copy() const;
1156
1157   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1158                          const CuMatrixBase<BaseFloat> &in,
1159                          CuMatrixBase<BaseFloat> *out) const;
1160   virtual void Backprop(const std::string &debug_info,
1161                         const ComponentPrecomputedIndexes *indexes,
1162                         const CuMatrixBase<BaseFloat> &, //in_value
1163                         const CuMatrixBase<BaseFloat> &, // out_value,
1164                         const CuMatrixBase<BaseFloat> &out_deriv,
1165                         Component *to_update,
1166                         CuMatrixBase<BaseFloat> *in_deriv) const;
1167
1168   virtual void Scale(BaseFloat scale) {}
1169   virtual void Add(BaseFloat alpha, const Component &other) {}
1170   virtual void Read(std::istream &is, bool binary); // This Read function
1171   // requires that the Component has the correct type.
1172   /// Write component to stream
1173   virtual void Write(std::ostream &os, bool binary) const;
1174   virtual std::string Info() const;
1175  private:
1176   // computes the reverse column map.  Must not be called if column_map_.Dim()
1177   // == 0
1178   void ComputeReverseColumnMap();
1179   CuArray<int32> column_map_;
1180   // the following is a derived variable, not written to disk.
1181   // It is used in backprop.
1182   CuArray<int32> reverse_column_map_;
1183   PermuteComponent &operator =
1184       (const PermuteComponent &other); // Disallow.
1185 };
1186
1187
1188
1189
1190 // PerElementScaleComponent scales each dimension of its input with a separate
1191 // trainable scale; it's like a linear component with a diagonal matrix.
1192 class PerElementScaleComponent: public UpdatableComponent {
1193  public:
1194   virtual int32 InputDim() const { return scales_.Dim(); }
1195   virtual int32 OutputDim() const { return scales_.Dim(); }
1196
1197   virtual std::string Info() const;
1198   virtual void InitFromConfig(ConfigLine *cfl);
1199
1200   PerElementScaleComponent() { } // use Init to really initialize.
1201   virtual std::string Type() const { return "PerElementScaleComponent"; }
1202   virtual int32 Properties() const {
1203     return kSimpleComponent|kUpdatableComponent|kLinearInInput|
1204         kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
1205   }
1206
1207   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1208                          const CuMatrixBase<BaseFloat> &in,
1209                          CuMatrixBase<BaseFloat> *out) const;
1210   virtual void Backprop(const std::string &debug_info,
1211                         const ComponentPrecomputedIndexes *indexes,
1212                         const CuMatrixBase<BaseFloat> &in_value,
1213                         const CuMatrixBase<BaseFloat> &, // out_value
1214                         const CuMatrixBase<BaseFloat> &out_deriv,
1215                         Component *to_update,
1216                         CuMatrixBase<BaseFloat> *in_deriv) const;
1217
1218   virtual void Read(std::istream &is, bool binary);
1219   virtual void Write(std::ostream &os, bool binary) const;
1220
1221   virtual Component* Copy() const;
1222
1223
1224   // Some functions from base-class UpdatableComponent.
1225   virtual void Scale(BaseFloat scale);
1226   virtual void Add(BaseFloat alpha, const Component &other);
1227   virtual void PerturbParams(BaseFloat stddev);
1228   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1229   virtual int32 NumParameters() const;
1230   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1231   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1232
1233   // Some functions that are specific to this class.
1234   explicit PerElementScaleComponent(const PerElementScaleComponent &other);
1235
1236   void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
1237   void Init(std::string vector_filename);
1238
1239  protected:
1240   friend class AffineComponent;  // necessary for collapse
1241   // This function Update() is for extensibility; child classes may override
1242   // this, e.g. for natural gradient update.
1243   virtual void Update(
1244       const std::string &debug_info,
1245       const CuMatrixBase<BaseFloat> &in_value,
1246       const CuMatrixBase<BaseFloat> &out_deriv) {
1247     UpdateSimple(in_value, out_deriv);
1248   }
1249   // UpdateSimple is used when *this is a gradient.  Child classes may override
1250   // this if needed, but typically won't need to.
1251   virtual void UpdateSimple(
1252       const CuMatrixBase<BaseFloat> &in_value,
1253       const CuMatrixBase<BaseFloat> &out_deriv);
1254
1255   const PerElementScaleComponent &operator
1256       = (const PerElementScaleComponent &other); // Disallow.
1257   CuVector<BaseFloat> scales_;
1258 };
1259
1260
1261 // PerElementOffsetComponent offsets each dimension of its input with a separate
1262 // trainable bias; it's like an affine component with fixed weight matrix which is always equal to I.
1263 class PerElementOffsetComponent: public UpdatableComponent {
1264  public:
1265   virtual int32 InputDim() const { return offsets_.Dim(); }
1266   virtual int32 OutputDim() const { return offsets_.Dim(); }
1267
1268   virtual std::string Info() const;
1269   virtual void InitFromConfig(ConfigLine *cfl);
1270
1271   PerElementOffsetComponent() { } // use Init to really initialize.
1272   virtual std::string Type() const { return "PerElementOffsetComponent"; }
1273   virtual int32 Properties() const {
1274     return kSimpleComponent|kUpdatableComponent|
1275            kBackpropInPlace|kPropagateInPlace;
1276   }
1277
1278   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1279                          const CuMatrixBase<BaseFloat> &in,
1280                          CuMatrixBase<BaseFloat> *out) const;
1281   virtual void Backprop(const std::string &debug_info,
1282                         const ComponentPrecomputedIndexes *indexes,
1283                         const CuMatrixBase<BaseFloat> &, // in_value
1284                         const CuMatrixBase<BaseFloat> &, // out_value
1285                         const CuMatrixBase<BaseFloat> &out_deriv,
1286                         Component *to_update,
1287                         CuMatrixBase<BaseFloat> *in_deriv) const;
1288
1289   virtual void Read(std::istream &is, bool binary);
1290   virtual void Write(std::ostream &os, bool binary) const;
1291
1292   virtual Component* Copy() const;
1293
1294
1295   // Some functions from base-class UpdatableComponent.
1296   virtual void Scale(BaseFloat scale);
1297   virtual void Add(BaseFloat alpha, const Component &other);
1298   virtual void PerturbParams(BaseFloat stddev);
1299   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1300   virtual int32 NumParameters() const;
1301   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1302   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1303
1304   // Some functions that are specific to this class.
1305   explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);
1306
1307   void Init(int32 dim, BaseFloat param_mean,
1308             BaseFloat param_stddev);
1309   void Init(std::string vector_filename);
1310
1311  protected:
1312   const PerElementOffsetComponent &operator
1313       = (const PerElementOffsetComponent &other); // Disallow.
1314   CuVector<BaseFloat> offsets_;
1315 };
1316
1317
1318 // ConstantFunctionComponent returns constant function of its input,
1319 // i.e. its output does not depend on its input.  It is the same as
1320 // an affine component with the linear term fixed at zero.
1321 // It is optionally trainable, and optionally you can use natural
1322 // gradient.  The input is required only because it's more convenient
1323 // to make SimpleComponents [but see ConstantComponent, which requires
1324 // no inputs].
1325 class ConstantFunctionComponent: public UpdatableComponent {
1326  public:
1327   virtual int32 InputDim() const { return input_dim_; }
1328   virtual int32 OutputDim() const { return output_.Dim(); }
1329
1330   virtual std::string Info() const;
1331   // possible parameter values with their defaults:
1332   // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
1333   // output-mean=0 output-stddev=0
1334   virtual void InitFromConfig(ConfigLine *cfl);
1335
1336   ConstantFunctionComponent();
1337
1338   ConstantFunctionComponent(const ConstantFunctionComponent &other);
1339
1340   virtual std::string Type() const { return "ConstantFunctionComponent"; }
1341   virtual int32 Properties() const {
1342     return kSimpleComponent|
1343         (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
1344         (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
1345         kBackpropAdds;
1346   }
1347   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1348                          const CuMatrixBase<BaseFloat> &in,
1349                          CuMatrixBase<BaseFloat> *out) const;
1350   virtual void Backprop(const std::string &debug_info,
1351                         const ComponentPrecomputedIndexes *indexes,
1352                         const CuMatrixBase<BaseFloat> &, // in_value
1353                         const CuMatrixBase<BaseFloat> &, // out_value
1354                         const CuMatrixBase<BaseFloat> &out_deriv,
1355                         Component *to_update,
1356                         CuMatrixBase<BaseFloat> *in_deriv) const;
1357
1358   virtual void Read(std::istream &is, bool binary);
1359   virtual void Write(std::ostream &os, bool binary) const;
1360
1361   virtual Component* Copy() const;
1362
1363   // Some functions from base-class UpdatableComponent.
1364   virtual void Scale(BaseFloat scale);
1365   virtual void Add(BaseFloat alpha, const Component &other);
1366   virtual void PerturbParams(BaseFloat stddev);
1367   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1368   virtual int32 NumParameters() const;
1369   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1370   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1371  private:
1372   int32 input_dim_;
1373   // the output value-- a vector.
1374   CuVector<BaseFloat> output_;
1375
1376   bool is_updatable_;
1377   // if true, and if updatable, do natural-gradient update.
1378   bool use_natural_gradient_;
1379   OnlineNaturalGradient preconditioner_;
1380
1381   const ConstantFunctionComponent &operator
1382   = (const ConstantFunctionComponent &other); // Disallow.
1383 };
1384
1385
1386
1387 // NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
1388 // it uses a natural gradient update for the per-element scales, and enforces a
1389 // maximum amount of change per minibatch, for stability.
1390 class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
1391  public:
1392
1393   virtual std::string Info() const;
1394
1395   virtual void InitFromConfig(ConfigLine *cfl);
1396
1397   NaturalGradientPerElementScaleComponent() { } // use Init to really initialize.
1398   virtual std::string Type() const {
1399     return "NaturalGradientPerElementScaleComponent";
1400   }
1401
1402   virtual void Read(std::istream &is, bool binary);
1403   virtual void Write(std::ostream &os, bool binary) const;
1404
1405   virtual Component* Copy() const;
1406
1407   // Some functions that are specific to this class:
1408   explicit NaturalGradientPerElementScaleComponent(
1409       const NaturalGradientPerElementScaleComponent &other);
1410
1411   void Init(int32 dim, BaseFloat param_mean,
1412             BaseFloat param_stddev, int32 rank, int32 update_period,
1413             BaseFloat num_samples_history, BaseFloat alpha,
1414             BaseFloat max_change_per_minibatch);
1415   void Init(std::string vector_filename,
1416             int32 rank, int32 update_period, BaseFloat num_samples_history,
1417             BaseFloat alpha, BaseFloat max_change_per_minibatch);
1418
1419  private:
1420   // configuration value for imposing max-change...
1421   BaseFloat max_change_per_minibatch_;
1422
1423   // unlike the NaturalGradientAffineComponent, there is only one dimension to
1424   // consider as the parameters are a vector not a matrix, so we only need one
1425   // preconditioner.
1426   // The preconditioner stores its own configuration values; we write and read
1427   // these, but not the preconditioner object itself.
1428   OnlineNaturalGradient preconditioner_;
1429
1430   // Override of the parent-class Update() function, called only
1431   // if this->is_gradient_ = false; this implements the natural
1432   // gradient update.
1433   virtual void Update(
1434       const std::string &debug_info,
1435       const CuMatrixBase<BaseFloat> &in_value,
1436       const CuMatrixBase<BaseFloat> &out_deriv);
1437
1438   const NaturalGradientPerElementScaleComponent &operator
1439       = (const NaturalGradientPerElementScaleComponent &other); // Disallow.
1440 };
1441
1442 /**
1443  * ConvolutionalComponent implements 2d-convolution.
1444  * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1445  * 2 dimensions as it has same size as the input along the 3rd dimension.
1446  * Input : A matrix where each row is a  vectorized 3D-tensor.
1447  *        The 3D tensor has dimensions
1448  *        x: (e.g. time)
1449  *        y: (e.g. frequency)
1450  *        z: (e.g. channels like features/delta/delta-delta)
1451  *
1452  *        The component supports input vectorizations of type zyx and yzx.
1453  *        The default vectorization type is zyx.
1454  *        e.g. for input vectorization of type zyx the input is vectorized by
1455  *        spanning axes z, y and x of the tensor in that order.
1456  *        Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1457  *        the zyx vectorized input looks like
1458  *  A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1459  *
1460  *
1461  * Output : The output is also a 3D tensor vectorized in the zyx format.
1462  *          The channel axis (z) in the output corresponds to the output of
1463  *          different filters. The first channel corresponds to the first filter
1464  *          i.e., first row of the filter_params_ matrix.
1465  *
1466  * Note: The component has to support yzx input vectorization as the binaries
1467  * like add-deltas generate yz vectorized output. These input vectors are
1468  * concatenated using the Append descriptor across time steps to form a yzx
1469  * vectorized 3D tensor input.
1470  * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1471  *
1472  *
1473  * For information on the hyperparameters and parameters of this component see
1474  * the variable declarations.
1475  *
1476  * Propagation:
1477  * ------------
1478  * Convolution operation consists of a dot-products between the filter tensor
1479  * and input tensor patch, for various shifts of filter tensor along the x and y
1480  * axes input tensor. (Note: there is no shift along z-axis as the filter and
1481  * input tensor have same size along this axis).
1482  *
1483  * For a particular shift (i,j) of the filter tensor
1484  * along input tensor dimensions x and y, the elements of the input tensor which
1485  * overlap with the filter form the input tensor patch. This patch is vectorized
1486  * in zyx format. All the patches corresponding to various samples in the
1487  * mini-batch are stacked into a matrix, where each row corresponds to one
1488  * patch. Let this matrix be represented by X_{i,j}. The dot products with
1489  * various filters are computed simultaneously by computing the matrix product
1490  * with the filter_params_ matrix (W)
1491  * Y_{i,j} = X_{i,j}*W^T.
1492  * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1493  *
1494  * All the matrix products corresponding to various shifts (i,j) of the
1495  * filter tensor are computed simultaneously using the AddMatMatBatched
1496  * call of CuMatrixBase class.
1497  *
1498  * BackPropagation:
1499  * ----------------
1500  *  Backpropagation to compute the input derivative (\nabla X_{i,j})
1501  *  consists of the a series of matrix products.
1502  *  \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the
1503  *   output derivative for a particular shift of the filter.
1504  *
1505  *   Once again these matrix products are computed simultaneously.
1506  *
1507  * Update:
1508  * -------
1509  *  The weight gradient is computed as
1510  *  \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j})
1511  *
1512  */
1513 class ConvolutionComponent: public UpdatableComponent {
1514  public:
1515   enum TensorVectorizationType  {
1516     kYzx = 0,
1517     kZyx = 1
1518   };
1519
1520   ConvolutionComponent();
1521   // constructor using another component
1522   ConvolutionComponent(const ConvolutionComponent &component);
1523   // constructor using parameters
1524   ConvolutionComponent(
1525     const CuMatrixBase<BaseFloat> &filter_params,
1526     const CuVectorBase<BaseFloat> &bias_params,
1527     int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1528     int32 filt_x_dim, int32 filt_y_dim,
1529     int32 filt_x_step, int32 filt_y_step,
1530     TensorVectorizationType input_vectorization,
1531     BaseFloat learning_rate);
1532
1533   virtual int32 InputDim() const;
1534   virtual int32 OutputDim() const;
1535
1536   virtual std::string Info() const;
1537   virtual void InitFromConfig(ConfigLine *cfl);
1538   virtual std::string Type() const { return "ConvolutionComponent"; }
1539   virtual int32 Properties() const {
1540     return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
1541            kBackpropAdds|kPropagateAdds;
1542   }
1543
1544   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1545                          const CuMatrixBase<BaseFloat> &in,
1546                          CuMatrixBase<BaseFloat> *out) const;
1547   virtual void Backprop(const std::string &debug_info,
1548                         const ComponentPrecomputedIndexes *indexes,
1549                         const CuMatrixBase<BaseFloat> &in_value,
1550                         const CuMatrixBase<BaseFloat> &, // out_value,
1551                         const CuMatrixBase<BaseFloat> &out_deriv,
1552                         Component *to_update_in,
1553                         CuMatrixBase<BaseFloat> *in_deriv) const;
1554   void Update(const std::string &debug_info,
1555               const CuMatrixBase<BaseFloat> &in_value,
1556               const CuMatrixBase<BaseFloat> &out_deriv,
1557               const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);
1558
1559
1560   virtual void Read(std::istream &is, bool binary);
1561   virtual void Write(std::ostream &os, bool binary) const;
1562
1563   virtual Component* Copy() const;
1564
1565   // Some functions from base-class UpdatableComponent.
1566   virtual void Scale(BaseFloat scale);
1567   virtual void Add(BaseFloat alpha, const Component &other);
1568   virtual void PerturbParams(BaseFloat stddev);
1569   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1570   virtual int32 NumParameters() const;
1571   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1572   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1573
1574   // Some functions that are specific to this class.
1575   void SetParams(const VectorBase<BaseFloat> &bias,
1576                  const MatrixBase<BaseFloat> &filter);
1577   const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
1578   const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
1579   void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1580             int32 filt_x_dim, int32 filt_y_dim,
1581             int32 filt_x_step, int32 filt_y_step, int32 num_filters,
1582             TensorVectorizationType input_vectorization,
1583             BaseFloat param_stddev, BaseFloat bias_stddev);
1584   // there is no filt_z_dim parameter as the length of the filter along
1585   // z-dimension is same as the input
1586   void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1587             int32 filt_x_dim, int32 filt_y_dim,
1588             int32 filt_x_step, int32 filt_y_step,
1589             TensorVectorizationType input_vectorization,
1590             std::string matrix_filename);
1591
1592   // resize the component, setting the parameters to zero, while
1593   // leaving any other configuration values the same
1594   void Resize(int32 input_dim, int32 output_dim);
1595
1596   void Update(const std::string &debug_info,
1597               const CuMatrixBase<BaseFloat> &in_value,
1598               const CuMatrixBase<BaseFloat> &out_deriv);
1599
1600
1601  private:
1602   int32 input_x_dim_;   // size of the input along x-axis
1603                         // (e.g. number of time steps)
1604
1605   int32 input_y_dim_;   // size of input along y-axis
1606                         // (e.g. number of mel-frequency bins)
1607
1608   int32 input_z_dim_;   // size of input along z-axis
1609                         // (e.g. number of channels is 3 if the input has
1610                         // features + delta + delta-delta features
1611
1612   int32 filt_x_dim_;    // size of the filter along x-axis
1613
1614   int32 filt_y_dim_;    // size of the filter along y-axis
1615
1616   // there is no filt_z_dim_ as it is always assumed to be
1617   // the same as input_z_dim_
1618
1619   int32 filt_x_step_;   // the number of steps taken along x-axis of input
1620                         //  before computing the next dot-product
1621                         //  of filter and input
1622
1623   int32 filt_y_step_;   // the number of steps taken along y-axis of input
1624                         // before computing the next dot-product of the filter
1625                         // and input
1626
1627   // there is no filt_z_step_ as only dot product is possible along this axis
1628
1629   TensorVectorizationType input_vectorization_; // type of vectorization of the
1630   // input 3D tensor. Accepts zyx and yzx formats
1631
1632   CuMatrix<BaseFloat> filter_params_;
1633   // the filter (or kernel) matrix is a matrix of vectorized 3D filters
1634   // where each row in the matrix corresponds to one filter.
1635   // The 3D filter tensor is vectorizedin zyx format.
1636   // The first row of the matrix corresponds to the first filter and so on.
1637   // Keep in mind the vectorization type and order of filters when using file
1638   // based initialization.
1639
1640   CuVector<BaseFloat> bias_params_;
1641   // the filter-specific bias vector (i.e., there is a seperate bias added
1642   // to the output of each filter).
1643   bool is_gradient_;
1644
1645   void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
1646                            CuMatrix<BaseFloat> *patches) const;
1647   void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
1648                                CuMatrixBase<BaseFloat> *in_deriv) const;
1649   const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
1650 };
1651
1652
1653 // LstmNonlinearityComponent is a component that implements part of an LSTM, by
1654 // combining together the sigmoids and tanh's, plus some diagonal terms, into
1655 // a single block.
1656 // We will refer to the LSTM formulation used in
1657 //
1658 // Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
1659 // by H. Sak et al,
1660 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1661 //
1662 // Suppose the cell dimension is C.  Then outside this component, we compute
1663 // the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1664 // matrix multiplication:
1665 //
1666 // i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1667 // f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1668 // c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
1669 // o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o
1670 //
1671 // The part of the computation that takes place in this component is as follows.
1672 // Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and
1673 // c_{t-1}).  Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1674 //
1675 // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1676 //
1677 //
1678 // This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f
1679 // and w_o.
1680 //
1681 //
1682 // In the forward pass (Propagate), this component computes the following:
1683 //
1684 //    i_t = Sigmoid(i_part + w_{ic}*c_{t-1})   (1)
1685 //    f_t = Sigmoid(f_part + w_{fc}*c_{t-1})   (2)
1686 //    c_t = f_t*c_{t-1} + i_t * Tanh(c_part)   (3)
1687 //    o_t = Sigmoid(o_part + w_{oc}*c_t)       (4)
1688 //    m_t = o_t * Tanh(c_t)                    (5)
1689 //   # note: the outputs are just c_t and m_t.
1690 //
1691 // The backprop is as you would think, but for the "self-repair" we need to pass
1692 // in additional vectors (of the same dim as the parameters of the layer) that
1693 // dictate whether or not we add an additional term to the backpropagated
1694 // derivatives.  (This term helps force the input to the nonlinearities into the
1695 // range where the derivatives are not too small).
1696 //
1697 // This component stores stats of the same form as are normally stored by the
1698 // StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1699 // activations and derivatives, but this is done inside the Backprop() functions.
1700 // [the StoreStats() functions don't take the input data as an argument, so
1701 // storing this data that way is impossible, and anyway it's more efficient to
1702 // do it as part of backprop.]
1703 class LstmNonlinearityComponent: public UpdatableComponent {
1704  public:
1705
1706   virtual int32 InputDim() const;
1707   virtual int32 OutputDim() const;
1708   virtual std::string Info() const;
1709   virtual void InitFromConfig(ConfigLine *cfl);
1710   LstmNonlinearityComponent() { } // use Init to really initialize.
1711   virtual std::string Type() const { return "LstmNonlinearityComponent"; }
1712   virtual int32 Properties() const {
1713     return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
1714   }
1715
1716   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1717                          const CuMatrixBase<BaseFloat> &in,
1718                          CuMatrixBase<BaseFloat> *out) const;
1719   virtual void Backprop(const std::string &debug_info,
1720                         const ComponentPrecomputedIndexes *indexes,
1721                         const CuMatrixBase<BaseFloat> &in_value,
1722                         const CuMatrixBase<BaseFloat> &, // out_value,
1723                         const CuMatrixBase<BaseFloat> &out_deriv,
1724                         Component *to_update_in,
1725                         CuMatrixBase<BaseFloat> *in_deriv) const;
1726
1727   virtual void Read(std::istream &is, bool binary);
1728   virtual void Write(std::ostream &os, bool binary) const;
1729
1730   virtual Component* Copy() const;
1731
1732   // Some functions from base-class UpdatableComponent.
1733   virtual void Scale(BaseFloat scale);
1734   virtual void Add(BaseFloat alpha, const Component &other);
1735   virtual void PerturbParams(BaseFloat stddev);
1736   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1737   virtual int32 NumParameters() const;
1738   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1739   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1740   virtual void ZeroStats();
1741
1742   // Some functions that are specific to this class:
1743   explicit LstmNonlinearityComponent(
1744       const LstmNonlinearityComponent &other);
1745
1746   void Init(int32 cell_dim, BaseFloat param_stddev,
1747             BaseFloat tanh_self_repair_threshold,
1748             BaseFloat sigmoid_self_repair_threshold,
1749             BaseFloat self_repair_scale);
1750
1751   void Init(std::string vector_filename,
1752             int32 rank, int32 update_period, BaseFloat num_samples_history,
1753             BaseFloat alpha, BaseFloat max_change_per_minibatch);
1754
1755  private:
1756
1757   // Initializes the natural-gradient object with the configuration we
1758   // use for this object, which for now is hardcoded at the C++ level.
1759   void InitNaturalGradient();
1760
1761
1762   // Notation: C is the cell dimension; it equals params_.NumCols().
1763
1764   // The dimension of the parameter matrix is (3 x C);
1765   // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
1766   CuMatrix<BaseFloat> params_;
1767
1768   // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1769   // equations (1) through (5), this is the sum of the values of the nonliearities
1770   // (used for diagnostics only).  It is comparable to value_sum_ vector
1771   // in base-class NonlinearComponent.
1772   CuMatrix<double> value_sum_;
1773
1774   // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1775   // equations (1) through (5), this is the sum of the derivatives of the
1776   // nonliearities (used for diagnostics and to control self-repair).  It is
1777   // comparable to the deriv_sum_ vector in base-class
1778   // NonlinearComponent.
1779   CuMatrix<double> deriv_sum_;
1780
1781   // This matrix has dimension 10.  The contents are a block of 5 self-repair
1782   // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
1783   // self-repair scales (typically all 0.00001).  These are for each of the 5
1784   // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
1785   // more info).
1786   CuVector<BaseFloat> self_repair_config_;
1787
1788   // This matrix has dimension 5.  For each of the 5 nonlinearities in the LSTM
1789   // component (see comments in cu-math.h for more info), it contains the total,
1790   // over all frames represented in count_, of the number of dimensions that
1791   // were subject to self_repair.  To get the self-repair proportion you should
1792   // divide by (count_ times cell_dim_).
1793   CuVector<double> self_repair_total_;
1794
1795   // The total count (number of frames) corresponding to the stats in value_sum_
1796   // and deriv_sum_.
1797   double count_;
1798
1799   // Preconditioner for the parameters of this component [operates in the space
1800   // of dimension C].
1801   // The preconditioner stores its own configuration values; we write and read
1802   // these, but not the preconditioner object itself.
1803   OnlineNaturalGradient preconditioner_;
1804
1805   const LstmNonlinearityComponent &operator
1806       = (const LstmNonlinearityComponent &other); // Disallow.
1807 };
1808
1809
1810
1811
1812 /*
1813  * MaxPoolingComponent :
1814  * Maxpooling component was firstly used in ConvNet for selecting an
1815  * representative activation in an area. It inspired Maxout nonlinearity.
1816  * Each output element of this component is the maximum of a block of
1817  * input elements where the block has a 3D dimension (pool_x_size_,
1818  * pool_y_size_, pool_z_size_).
1819  * Blocks could overlap if the shift value on any axis is smaller
1820  * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
1821  * If the shift values are euqal to their pool size, there is no
1822  * overlap; while if they all equal 1, the blocks overlap to
1823  * the greatest possible extent.
1824  *
1825  * This component is designed to be used after a ConvolutionComponent
1826  * so that the input matrix is propagated from a 2d-convolutional layer.
1827  * This component implements 3d-maxpooling which performs
1828  * max pooling along the three axes.
1829  * Input : A matrix where each row is a vectorized 3D-tensor.
1830  *        The 3D tensor has dimensions
1831  *        x: (e.g. time)
1832  *        y: (e.g. frequency)
1833  *        z: (e.g. channels like number of filters in the ConvolutionComponent)
1834  *
1835  *        The component assumes input vectorizations of type zyx
1836  *        which is the default output vectorization type of a ConvolutionComponent.
1837  *        e.g. for input vectorization of type zyx the input is vectorized by
1838  *        spanning axes z, y and x of the tensor in that order.
1839  *        Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1840  *        the zyx vectorized input looks like
1841  *  A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1842  *
1843  * Output : The output is also a 3D tensor vectorized in the zyx format.
1844  *
1845  * For information on the hyperparameters and parameters of this component see
1846  * the variable declarations.
1847  *
1848  *
1849  */
1850
1851 class MaxpoolingComponent: public Component {
1852  public:
1853
1854   MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
1855                            pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
1856                            pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
1857   // constructor using another component
1858   MaxpoolingComponent(const MaxpoolingComponent &component);
1859
1860   virtual int32 InputDim() const;
1861   virtual int32 OutputDim() const;
1862   virtual void Check() const;
1863
1864   virtual std::string Info() const;
1865   virtual void InitFromConfig(ConfigLine *cfl);
1866   virtual std::string Type() const { return "MaxpoolingComponent"; }
1867   virtual int32 Properties() const {
1868     return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
1869            kBackpropAdds;
1870   }
1871
1872   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1873                          const CuMatrixBase<BaseFloat> &in,
1874                          CuMatrixBase<BaseFloat> *out) const;
1875   virtual void Backprop(const std::string &debug_info,
1876                         const ComponentPrecomputedIndexes *indexes,
1877                         const CuMatrixBase<BaseFloat> &in_value,
1878                         const CuMatrixBase<BaseFloat> &out_value,
1879                         const CuMatrixBase<BaseFloat> &out_deriv,
1880                         Component *, // to_update,
1881                         CuMatrixBase<BaseFloat> *in_deriv) const;
1882
1883   virtual void Read(std::istream &is, bool binary); // This Read function
1884   // requires that the Component has the correct type.
1885
1886   /// Write component to stream
1887   virtual void Write(std::ostream &os, bool binary) const;
1888   virtual Component* Copy() const { return new MaxpoolingComponent(*this); }
1889
1890   void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
1891                            CuMatrix<BaseFloat> *patches) const;
1892   void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
1893                                CuMatrixBase<BaseFloat> *in_deriv) const;
1894
1895  protected:
1896   int32 input_x_dim_;   // size of the input along x-axis
1897   // (e.g. number of time steps)
1898   int32 input_y_dim_;   // size of input along y-axis
1899   // (e.g. number of mel-frequency bins)
1900   int32 input_z_dim_;   // size of input along z-axis
1901   // (e.g. number of filters in the ConvolutionComponent)
1902
1903   int32 pool_x_size_;    // size of the pooling window along x-axis
1904   int32 pool_y_size_;    // size of the pooling window along y-axis
1905   int32 pool_z_size_;    // size of the pooling window along z-axis
1906
1907   int32 pool_x_step_;   // the number of steps taken along x-axis of input
1908   //  before computing the next pool
1909   int32 pool_y_step_;   // the number of steps taken along y-axis of input
1910   // before computing the next pool
1911   int32 pool_z_step_;   // the number of steps taken along z-axis of input
1912   // before computing the next pool
1913
1914 };
1915
1916
1917 /**
1918    CompositeComponent is a component representing a sequence of
1919    [simple] components.  The config line would be something like the following
1920    (imagine this is all on one line):
1921
1922    component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
1923       component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
1924       component2='type=RectifiedLinearComponent dim=10000' \
1925       component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
1926
1927    The reason you might want to use this component, instead of directly using
1928    the same sequence of components in the config file, is to save GPU memory (at
1929    the expense of more compute)-- because doing it like this means we have to
1930    re-do parts of the forward pass in the backprop phase, but we avoid using
1931    much memory for very long (and you can make the memory usage very small by
1932    making max-rows-process small).  We inherit from UpdatableComponent just in
1933    case one or more of the components in the sequence are updatable.
1934
1935    It is an error to nest a CompositeComponent inside a CompositeComponent.
1936    The same effect can be accomplished by specifying a smaller max-rows-process
1937    in a single CompositeComponent.
1938  */
1939 class CompositeComponent: public UpdatableComponent {
1940  public:
1941   virtual int32 InputDim() const;
1942   virtual int32 OutputDim() const;
1943
1944   virtual std::string Info() const;
1945
1946   virtual void InitFromConfig(ConfigLine *cfl);
1947
1948   virtual Component* Copy() const;
1949
1950   CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.
1951
1952   // Initialize from this list of components; takes ownership of the pointers.
1953   void Init(const std::vector<Component*> &components,
1954             int32 max_rows_process);
1955
1956   virtual std::string Type() const { return "CompositeComponent"; }
1957
1958   // The properties depend on the properties of the constituent components.  As
1959   // a special case, we never return kStoresStats in the properties: by default
1960   // we store things like activation stats (e.g. for nonlinear components like
1961   // ReLU) as part of the backprop.  This means we may wastefully store stats
1962   // even when not requested, but it does save time as a separate StoreStats()
1963   // call would involve propagating the internals.
1964   virtual int32 Properties() const;
1965
1966   virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1967                          const CuMatrixBase<BaseFloat> &in,
1968                          CuMatrixBase<BaseFloat> *out) const;
1969   virtual void Backprop(const std::string &debug_info,
1970                         const ComponentPrecomputedIndexes *indexes,
1971                         const CuMatrixBase<BaseFloat> &in_value,
1972                         const CuMatrixBase<BaseFloat> &, // out_value
1973                         const CuMatrixBase<BaseFloat> &out_deriv,
1974                         Component *to_update,
1975                         CuMatrixBase<BaseFloat> *in_deriv) const;
1976
1977   // note, we don't implement StoreStats() as it would be inefficient.  Instead,
1978   // by default we call StoreStats() on all members that have the flag set,
1979   // inside the Backprop.
1980   virtual void ZeroStats();
1981
1982   virtual void Read(std::istream &is, bool binary);
1983   virtual void Write(std::ostream &os, bool binary) const;
1984
1985   // Don't implement Copy() at this level: implement it in the child class.
1986
1987   // Some functions from base-class UpdatableComponent.
1988   virtual void SetUnderlyingLearningRate(BaseFloat lrate);
1989   virtual void SetActualLearningRate(BaseFloat lrate);
1990   virtual void SetAsGradient();
1991   virtual void Scale(BaseFloat scale);
1992   virtual void Add(BaseFloat alpha, const Component &other);
1993   virtual void PerturbParams(BaseFloat stddev);
1994   virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1995   virtual int32 NumParameters() const;
1996   virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1997   virtual void UnVectorize(const VectorBase<BaseFloat> &params);
1998
1999   // note: we dont implement the StoreStats function as it would be quite
2000   // expensive; instead, by default we call StoreStats() for any components that
2001   // want to store stats, as part of the backprop pass.  This is not 100% ideal
2002   // but it will usually do what you want.  We can revisit this later if needed.
2003
2004   // Functions to iterate over the internal components
2005
2006   int32 NumComponents() const { return components_.size();}
2007   /// Gets the ith component in this component.
2008   /// The ordering is the same as in the config line. The caller
2009   /// does not own the received component.
2010   const Component* GetComponent(int32 i) const;
2011   /// Sets the ith component. After this call, CompositeComponent owns
2012   /// the reference to the argument component. Frees the previous
2013   /// ith component.
2014   void SetComponent(int32 i, Component *component);
2015
2016   virtual ~CompositeComponent() { DeletePointers(&components_); }
2017  private:
2018   // returns the stride type, kDefaultStride or kStrideEqualNumCols,
2019   // at the output of the i'th component.
2020   inline MatrixStrideType GetStrideType(int32 i) const;
2021
2022   // returns true if at least one of 'components_' returns the kUpdatable flag
2023   // in its flags.
2024   bool IsUpdatable() const;
2025
2026   // the maximum number of
2027   int32 max_rows_process_;
2028   std::vector<Component*> components_;
2029
2030 };
2031
2032
2033 } // namespace nnet3
2034 } // namespace kaldi
2035
2036
2037 #endif