1 // nnet3/nnet-simple-component.h
3 // Copyright 2011-2013 Karel Vesely
4 // 2012-2015 Johns Hopkins University (author: Daniel Povey)
5 // 2013 Xiaohui Zhang
6 // 2014-2015 Vijayaditya Peddinti
7 // 2014-2015 Guoguo Chen
8 // 2015 Daniel Galvez
9 // 2015 Tom Ko
11 // See ../../COPYING for clarification regarding multiple authors
12 //
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 //
17 // http://www.apache.org/licenses/LICENSE-2.0
18 //
19 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
20 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
21 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
23 // See the Apache 2 License for the specific language governing permissions and
24 // limitations under the License.
26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
29 #include "nnet3/nnet-common.h"
30 #include "nnet3/nnet-component-itf.h"
31 #include "nnet3/natural-gradient-online.h"
32 #include <iostream>
34 namespace kaldi {
35 namespace nnet3 {
37 /// @file nnet-simple-component.h
38 /// This file contains declarations of components that are "simple", meaning
39 /// they don't care about the indexes they are operating on, produce one
40 /// output for one input, and return the kSimpleComponent flag in their
41 /// Properties(): for example, tanh and affine components. In
42 /// nnet-general-component.h there are components that don't fit this pattern.
// This "nnet3" version of the p-norm component only supports the 2-norm.
// The input is divided into consecutive groups (presumably of size
// input-dim / output-dim -- TODO confirm against the .cc file), and each
// output is the 2-norm of its group.
class PnormComponent: public Component {
 public:
  // Sets the input and output dimensions of the component.
  void Init(int32 input_dim, int32 output_dim);
  explicit PnormComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
  }
  PnormComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "PnormComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new PnormComponent(input_dim_,
                                                              output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
// This component randomly zeros dropout_proportion of the input
// and the derivatives are backpropagated through the nonzero inputs.
// Typically this component is used during training but not at test time.
// The idea is described under the name Dropout, in the paper
// "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
class DropoutComponent : public RandomComponent {
 public:
  // Sets the dimension, the proportion of inputs to zero, and whether
  // dropout is applied per frame (whole rows) rather than per element.
  void Init(int32 dim, BaseFloat dropout_proportion = 0.0,
            bool dropout_per_frame = false);

  DropoutComponent(int32 dim, BaseFloat dropout = 0.0,
                   bool dropout_per_frame = false) {
    Init(dim, dropout, dropout_per_frame);
  }

  DropoutComponent(): dim_(0), dropout_proportion_(0.0),
                      dropout_per_frame_(false) { }

  virtual int32 Properties() const {
    return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
  }
  virtual std::string Type() const { return "DropoutComponent"; }

  virtual void InitFromConfig(ConfigLine *cfl);

  // Input and output dimensions are the same: dropout does not reshape.
  virtual int32 InputDim() const { return dim_; }

  virtual int32 OutputDim() const { return dim_; }

  virtual void Read(std::istream &is, bool binary);

  // Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new DropoutComponent(dim_,
                                                                dropout_proportion_,
                                                                dropout_per_frame_); }
  virtual std::string Info() const;

  // Allows the dropout proportion to be changed at runtime, e.g. on a
  // schedule during training.
  void SetDropoutProportion(BaseFloat dropout_proportion) {
    dropout_proportion_ = dropout_proportion;
  }

 private:
  int32 dim_;
  /// dropout-proportion is the proportion that is dropped out,
  /// e.g. if 0.1, we set 10% to zero value.
  BaseFloat dropout_proportion_;
  bool dropout_per_frame_;
};
// Component that forms its output as an elementwise product over
// parts of its input (presumably input-dim is a multiple of output-dim
// and the parts are multiplied together -- TODO confirm against the .cc file).
class ElementwiseProductComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim);
  explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput;
  }
  ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "ElementwiseProductComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
                                                                           output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
// Component that normalizes each row of its input so that the output has a
// target root-mean-square value (target-rms); optionally it appends the
// log of the standard deviation as one extra output dimension.
class NormalizeComponent: public Component {
 public:
  void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
  explicit NormalizeComponent(int32 input_dim,
                              BaseFloat target_rms = 1.0,
                              bool add_log_stddev = false) {
    Init(input_dim, target_rms, add_log_stddev);
  }
  explicit NormalizeComponent(const NormalizeComponent &other);
  // note: there is some special code in NonlinearComponent::Info() that
  // specifically caters to this class.
  virtual int32 Properties() const {
    // When add_log_stddev_ is true the output dim differs from the input dim,
    // so in-place propagation/backprop is not possible.
    return (add_log_stddev_ ?
            kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
            kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
            kBackpropAdds|kBackpropInPlace);
  }
  NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
  virtual std::string Type() const { return "NormalizeComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual Component* Copy() const { return new NormalizeComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const {
    // +1 for the appended log-stddev dimension, if enabled.
    return (input_dim_ + (add_log_stddev_ ? 1 : 0));
  }
  virtual std::string Info() const;
 private:
  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
  enum { kExpSquaredNormFloor = -66 };
  // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly
  // representable in float and whose inverse square root is also exactly
  // representable in float (hence, an even power of two).
  static const BaseFloat kSquaredNormFloor;
  int32 input_dim_;
  BaseFloat target_rms_; // The target rms for outputs.
  bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
  // is an extra dimension of the output.
};
// Elementwise sigmoid nonlinearity.
class SigmoidComponent: public NonlinearComponent {
 public:
  explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
  SigmoidComponent() { }
  virtual std::string Type() const { return "SigmoidComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
  }
  virtual Component* Copy() const { return new SigmoidComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       SigmoidComponent *to_update) const;

  SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
};
// Elementwise tanh nonlinearity.
class TanhComponent: public NonlinearComponent {
 public:
  explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
  TanhComponent() { }
  virtual std::string Type() const { return "TanhComponent"; }
  virtual Component* Copy() const { return new TanhComponent(*this); }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       TanhComponent *to_update) const;

  TanhComponent &operator = (const TanhComponent &other); // Disallow.
};
// Elementwise rectified-linear (ReLU) nonlinearity.
class RectifiedLinearComponent: public NonlinearComponent {
 public:
  explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
      NonlinearComponent(other) { }
  RectifiedLinearComponent() { }
  virtual std::string Type() const { return "RectifiedLinearComponent"; }
  virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
        kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

 private:
  // this function is called from Backprop code and only does something if the
  // self-repair-scale config value is set.
  void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
                       RectifiedLinearComponent *to_update) const;

  RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
};
/**
   This component is a fixed (non-trainable) nonlinearity that sums its inputs
   to produce outputs. Currently the only supported configuration is that its
   input-dim is interpreted as consisting of n blocks, and the output is just a
   summation over the n blocks, where n = input-dim / output-dim, so for instance
    output[n] = input[n] + input[block-size + n] + .... .
   Later if needed we can add a configuration variable that allows you to sum
   over 'interleaved' input.
 */
class SumReduceComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim);
  explicit SumReduceComponent(int32 input_dim, int32 output_dim) {
    Init(input_dim, output_dim);
  }
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput;
  }
  SumReduceComponent(): input_dim_(0), output_dim_(0) { }
  virtual std::string Type() const { return "SumReduceComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const { return new SumReduceComponent(input_dim_,
                                                                  output_dim_); }

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
};
377 class FixedAffineComponent;
378 class FixedScaleComponent;
379 class PerElementScaleComponent;
380 class PerElementOffsetComponent;
382 // Affine means a linear function plus an offset.
383 // Note: although this class can be instantiated, it also
384 // functions as a base-class for more specialized versions of
385 // AffineComponent.
386 class AffineComponent: public UpdatableComponent {
387 friend class SoftmaxComponent; // Friend declaration relates to mixing up.
388 public:
390 virtual int32 InputDim() const { return linear_params_.NumCols(); }
391 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
393 virtual std::string Info() const;
394 virtual void InitFromConfig(ConfigLine *cfl);
396 AffineComponent() { } // use Init to really initialize.
397 virtual std::string Type() const { return "AffineComponent"; }
398 virtual int32 Properties() const {
399 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
400 kBackpropNeedsInput|kBackpropAdds;
401 }
404 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
405 const CuMatrixBase<BaseFloat> &in,
406 CuMatrixBase<BaseFloat> *out) const;
407 virtual void Backprop(const std::string &debug_info,
408 const ComponentPrecomputedIndexes *indexes,
409 const CuMatrixBase<BaseFloat> &in_value,
410 const CuMatrixBase<BaseFloat> &, // out_value
411 const CuMatrixBase<BaseFloat> &out_deriv,
412 Component *to_update,
413 CuMatrixBase<BaseFloat> *in_deriv) const;
415 virtual void Read(std::istream &is, bool binary);
416 virtual void Write(std::ostream &os, bool binary) const;
418 virtual Component* Copy() const;
421 // Some functions from base-class UpdatableComponent.
422 virtual void Scale(BaseFloat scale);
423 virtual void Add(BaseFloat alpha, const Component &other);
424 virtual void PerturbParams(BaseFloat stddev);
425 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
426 virtual int32 NumParameters() const;
427 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
428 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
430 // Some functions that are specific to this class.
432 // This new function is used when mixing up:
433 virtual void SetParams(const VectorBase<BaseFloat> &bias,
434 const MatrixBase<BaseFloat> &linear);
435 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
436 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
437 explicit AffineComponent(const AffineComponent &other);
438 // The next constructor is used in converting from nnet1.
439 AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
440 const CuVectorBase<BaseFloat> &bias_params,
441 BaseFloat learning_rate);
442 void Init(int32 input_dim, int32 output_dim,
443 BaseFloat param_stddev, BaseFloat bias_stddev);
444 void Init(std::string matrix_filename);
446 // This function resizes the dimensions of the component, setting the
447 // parameters to zero, while leaving any other configuration values the same.
448 virtual void Resize(int32 input_dim, int32 output_dim);
450 // The following functions are used for collapsing multiple layers
451 // together. They return a pointer to a new Component equivalent to
452 // the sequence of two components. We haven't implemented this for
453 // FixedLinearComponent yet.
454 Component *CollapseWithNext(const AffineComponent &next) const ;
455 Component *CollapseWithNext(const FixedAffineComponent &next) const;
456 Component *CollapseWithNext(const FixedScaleComponent &next) const;
457 Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
459 protected:
460 friend class NaturalGradientAffineComponent;
461 // This function Update() is for extensibility; child classes may override
462 // this, e.g. for natural gradient update.
463 virtual void Update(
464 const std::string &debug_info,
465 const CuMatrixBase<BaseFloat> &in_value,
466 const CuMatrixBase<BaseFloat> &out_deriv) {
467 UpdateSimple(in_value, out_deriv);
468 }
469 // UpdateSimple is used when *this is a gradient. Child classes may override
470 // this if needed, but typically won't need to.
471 virtual void UpdateSimple(
472 const CuMatrixBase<BaseFloat> &in_value,
473 const CuMatrixBase<BaseFloat> &out_deriv);
475 const AffineComponent &operator = (const AffineComponent &other); // Disallow.
476 CuMatrix<BaseFloat> linear_params_;
477 CuVector<BaseFloat> bias_params_;
478 };
480 class RepeatedAffineComponent;
482 /// This class implements an affine transform using a block diagonal matrix
483 /// e.g., one whose weight matrix is all zeros except for blocks on the
484 /// diagonal. All these blocks have the same dimensions.
485 /// input-dim: num cols of block diagonal matrix.
486 /// output-dim: num rows of block diagonal matrix.
487 /// num-blocks: number of blocks in diagonal of the matrix.
488 /// num-blocks must divide both input-dim and output-dim
489 class BlockAffineComponent : public UpdatableComponent {
490 public:
491 virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
492 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
494 virtual std::string Info() const;
495 virtual void InitFromConfig(ConfigLine *cfl);
497 BlockAffineComponent() { }
498 virtual std::string Type() const { return "BlockAffineComponent"; }
499 virtual int32 Properties() const {
500 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
501 kBackpropNeedsInput|kBackpropAdds;
502 }
504 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
505 const CuMatrixBase<BaseFloat> &in,
506 CuMatrixBase<BaseFloat> *out) const;
508 virtual void Backprop(const std::string &debug_info,
509 const ComponentPrecomputedIndexes *indexes,
510 const CuMatrixBase<BaseFloat> &in_value,
511 const CuMatrixBase<BaseFloat> &, // out_value
512 const CuMatrixBase<BaseFloat> &out_deriv,
513 Component *to_update,
514 CuMatrixBase<BaseFloat> *in_deriv) const;
516 virtual void Read(std::istream &is, bool binary);
517 virtual void Write(std::ostream &os, bool binary) const;
519 virtual Component* Copy() const;
521 // Functions from base-class UpdatableComponent.
522 virtual void Scale(BaseFloat scale);
523 virtual void Add(BaseFloat alpha, const Component &other);
524 virtual void PerturbParams(BaseFloat stddev);
525 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
526 virtual int32 NumParameters() const;
527 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
528 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
530 // BlockAffine-specific functions.
531 void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
532 BaseFloat param_stddev, BaseFloat bias_mean,
533 BaseFloat bias_stddev);
534 explicit BlockAffineComponent(const BlockAffineComponent &other);
535 explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
536 protected:
537 // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
538 // equal size. The blocks are stored in linear_params_ as
539 // [ M
540 // N
541 // O ] but we actually treat it as the matrix:
542 // [ M 0 0
543 // 0 N 0
544 // 0 0 O ]
545 CuMatrix<BaseFloat> linear_params_;
546 CuVector<BaseFloat> bias_params_;
547 int32 num_blocks_;
548 private:
549 const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
550 };
552 class RepeatedAffineComponent: public UpdatableComponent {
553 public:
555 virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
556 virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
558 virtual std::string Info() const;
559 virtual void InitFromConfig(ConfigLine *cfl);
561 RepeatedAffineComponent() { } // use Init to really initialize.
562 virtual std::string Type() const { return "RepeatedAffineComponent"; }
563 virtual int32 Properties() const {
564 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
565 kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
566 }
567 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
568 const CuMatrixBase<BaseFloat> &in,
569 CuMatrixBase<BaseFloat> *out) const;
570 virtual void Backprop(const std::string &debug_info,
571 const ComponentPrecomputedIndexes *indexes,
572 const CuMatrixBase<BaseFloat> &in_value,
573 const CuMatrixBase<BaseFloat> &, // out_value
574 const CuMatrixBase<BaseFloat> &out_deriv,
575 Component *to_update,
576 CuMatrixBase<BaseFloat> *in_deriv) const;
578 virtual void Read(std::istream &is, bool binary);
579 virtual void Write(std::ostream &os, bool binary) const;
581 virtual Component* Copy() const;
583 // Some functions from base-class UpdatableComponent.
584 virtual void Scale(BaseFloat scale);
585 virtual void Add(BaseFloat alpha, const Component &other);
586 virtual void PerturbParams(BaseFloat stddev);
587 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
588 virtual int32 NumParameters() const;
589 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
590 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
592 // Some functions that are specific to this class.
593 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
594 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
595 explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
597 void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
598 BaseFloat param_stddev, BaseFloat bias_mean,
599 BaseFloat bias_stddev);
600 friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
601 protected:
602 // This function Update(), called from backprop, is broken out for
603 // extensibility to natural gradient update.
604 virtual void Update(
605 const CuMatrixBase<BaseFloat> &in_value,
606 const CuMatrixBase<BaseFloat> &out_deriv);
608 // This function does nothing here but is redefined in child-class
609 // NaturalGradientRepeatedAffineComponent. This help avoid repeated code.
610 virtual void SetNaturalGradientConfigs() { }
612 const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
613 CuMatrix<BaseFloat> linear_params_;
614 CuVector<BaseFloat> bias_params_;
615 int32 num_repeats_;
616 };
// Version of RepeatedAffineComponent whose Update() applies a natural
// gradient (preconditioned) update on the input side.
class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
 public:
  // Use Init() to really initialize.
  NaturalGradientRepeatedAffineComponent() { }

  // Most of the public functions are inherited from RepeatedAffineComponent.
  virtual std::string Type() const {
    return "NaturalGradientRepeatedAffineComponent";
  }

  virtual Component* Copy() const;

  // Copy constructor
  explicit NaturalGradientRepeatedAffineComponent(
      const NaturalGradientRepeatedAffineComponent &other);
 private:
  virtual void Update(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientRepeatedAffineComponent &operator=(
      const NaturalGradientRepeatedAffineComponent &other); // Disallow.

  // Applies the default configuration to preconditioner_in_.
  virtual void SetNaturalGradientConfigs();

  // For efficiency reasons we only apply the natural gradient to the input
  // side, i.e. not to the space of output derivatives-- we believe the input
  // side is the more important side.  We don't make the natural-gradient
  // configurable; we just give it a reasonable configuration.
  // Instead of using the individual data-points, for efficiency reasons we use
  // the distribution of per-minibatch summed derivatives over each dimension of
  // the output space, as the source for the Fisher matrix.
  OnlineNaturalGradient preconditioner_in_;
};
// Softmax nonlinearity (per row of the input matrix).
class SoftmaxComponent: public NonlinearComponent {
 public:
  explicit SoftmaxComponent(const SoftmaxComponent &other):
      NonlinearComponent(other) { }
  SoftmaxComponent() { }
  virtual std::string Type() const { return "SoftmaxComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 private:
  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};
// Log-softmax nonlinearity (per row of the input matrix).
class LogSoftmaxComponent: public NonlinearComponent {
 public:
  explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
      NonlinearComponent(other) { }
  LogSoftmaxComponent() { }
  virtual std::string Type() const { return "LogSoftmaxComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
 private:
  LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
};
/// Keywords: natural gradient descent, NG-SGD, naturalgradient.  For
/// the top-level of the natural gradient code look here, and also in
/// nnet-precondition-online.h.
/// NaturalGradientAffineComponent is
/// a version of AffineComponent that has a non-(multiple of unit) learning-rate
/// matrix.  See nnet-precondition-online.h for a description of the technique.
/// It is described, under the name Online NG-SGD, in the paper "Parallel
/// training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
/// workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
class NaturalGradientAffineComponent: public AffineComponent {
 public:
  virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  // Initializes with random parameters of the given dims, plus the
  // natural-gradient configuration values.
  void Init(int32 input_dim, int32 output_dim,
            BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
            int32 rank_in, int32 rank_out, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha,
            BaseFloat max_change_per_sample);
  // Initializes by reading the parameter matrix from a file.
  void Init(int32 rank_in, int32 rank_out, int32 update_period,
            BaseFloat num_samples_history,
            BaseFloat alpha, BaseFloat max_change_per_sample,
            std::string matrix_filename);
  // this constructor does not really initialize, use Init() or Read().
  NaturalGradientAffineComponent();
  virtual void Resize(int32 input_dim, int32 output_dim);
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Info() const;
  virtual Component* Copy() const;
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  // copy constructor
  explicit NaturalGradientAffineComponent(
      const NaturalGradientAffineComponent &other);
  virtual void ZeroStats();

 private:
  // disallow assignment operator.
  NaturalGradientAffineComponent &operator= (
      const NaturalGradientAffineComponent&);

  // Configs for preconditioner.  The input side tends to be better conditioned ->
  // smaller rank needed, so make them separately configurable.
  int32 rank_in_;
  int32 rank_out_;
  int32 update_period_;
  BaseFloat num_samples_history_;
  BaseFloat alpha_;

  OnlineNaturalGradient preconditioner_in_;

  OnlineNaturalGradient preconditioner_out_;

  // If > 0, max_change_per_sample_ is the maximum amount of parameter
  // change (in L2 norm) that we allow per sample, averaged over the minibatch.
  // This was introduced in order to control instability.
  // Instead of the exact L2 parameter change, for
  // efficiency purposes we limit a bound on the exact
  // change.  The limit is applied via a constant <= 1.0
  // for each minibatch, A suitable value might be, for
  // example, 10 or so; larger if there are more
  // parameters.
  BaseFloat max_change_per_sample_;

  // update_count_ records how many updates we have done.
  double update_count_;

  // active_scaling_count_ records how many updates we have done,
  // where the scaling factor is active (not 1.0).
  double active_scaling_count_;

  // max_change_scale_stats_ records the sum of scaling factors
  // in each update, so we can compute the averaged scaling factor
  // in Info().
  double max_change_scale_stats_;

  // Sets the configs rank, alpha and eta in the preconditioner objects,
  // from the class variables.
  void SetNaturalGradientConfigs();

  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);
};
/// FixedAffineComponent is an affine transform that is supplied
/// at network initialization time and is not trainable.
class FixedAffineComponent: public Component {
 public:
  FixedAffineComponent() { }
  virtual std::string Type() const { return "FixedAffineComponent"; }
  virtual std::string Info() const;

  // Copy constructor from AffineComponent-- can be used when we're done
  // training a particular part of the model and want to efficiently disable
  // further training.
  FixedAffineComponent(const AffineComponent &c);

  /// matrix should be of size input-dim+1 to output-dim, last col is offset
  void Init(const CuMatrixBase<BaseFloat> &matrix);

  // The ConfigLine cfl contains just the option matrix=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
  virtual int32 InputDim() const { return linear_params_.NumCols(); }
  virtual int32 OutputDim() const { return linear_params_.NumRows(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Function to provide access to linear_params_.
  const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 protected:
  friend class AffineComponent;
  // The linear term of the transform; dimensions are output-dim by input-dim
  // (InputDim()/OutputDim() above read off NumCols()/NumRows()).
  CuMatrix<BaseFloat> linear_params_;
  // The constant offset; per the Init() comment above, this comes from the
  // last column of the supplied matrix.
  CuVector<BaseFloat> bias_params_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
};
/// SumGroupComponent is used to sum up groups of posteriors.
/// It's used to introduce a kind of Gaussian-mixture-model-like
/// idea into neural nets.  This is basically a degenerate case of
/// MixtureProbComponent; we had to implement it separately to
/// be efficient for CUDA (we can use this one regardless whether
/// we have CUDA or not; it's the normal case we want anyway).
///
/// There are two forms of initialization in a config file: one
/// where the number of elements are specified for each group
/// individually as a vector, and one where only the total input
/// dimension and the output dimension (number of groups) is specified.
/// The second is used when all groups have the same size.
class SumGroupComponent: public Component {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  void Init(const std::vector<int32> &sizes);  // the vector is of the input dim
                                               // (>= 1) for each output dim.
  void Init(int32 input_dim, int32 output_dim);
  void GetSizes(std::vector<int32> *sizes) const;  // Get a vector saying, for
                                                   // each output-dim, how many
                                                   // inputs were summed over.
  virtual void InitFromConfig(ConfigLine *cfl);
  SumGroupComponent() { }
  virtual std::string Type() const { return "SumGroupComponent"; }
  virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
  // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
  // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
  CuArray<Int32Pair> indexes_;  // for each output index, the (start, end) input
                                // index.
  CuArray<int32> reverse_indexes_;  // for each input index, the output index.
  int32 input_dim_;   // total input dimension (sum over group sizes).
  int32 output_dim_;  // number of groups, i.e. the output dimension.
};
/// FixedScaleComponent applies a fixed per-element scale; it's similar
/// to the Rescale component in the nnet1 setup (and only needed for nnet1
/// model conversion).
class FixedScaleComponent: public Component {
 public:
  FixedScaleComponent() { }
  virtual std::string Type() const { return "FixedScaleComponent"; }
  virtual std::string Info() const;
  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
  }

  // Initializes the component from the given vector of per-element scales.
  void Init(const CuVectorBase<BaseFloat> &scales);

  // The ConfigLine cfl contains only the option scales=<string>,
  // where the string is the filename of a Kaldi-format vector to read
  // (note: Init() above takes a vector, one scale per dimension).
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  friend class AffineComponent;  // necessary for collapse
  // The fixed (non-trainable) per-element scales; dimension is the
  // input/output dimension.
  CuVector<BaseFloat> scales_;
  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
};
936 /// FixedBiasComponent applies a fixed per-element bias; it's similar
937 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
938 /// model conversion.
939 class FixedBiasComponent: public Component {
940 public:
941 FixedBiasComponent() { }
942 virtual std::string Type() const { return "FixedBiasComponent"; }
943 virtual std::string Info() const;
945 virtual int32 Properties() const {
946 return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
947 }
949 void Init(const CuVectorBase<BaseFloat> &scales);
951 // The ConfigLine cfl contains only the option bias=<string>,
952 // where the string is the filename of a Kaldi-format matrix to read.
953 virtual void InitFromConfig(ConfigLine *cfl);
954 virtual int32 InputDim() const { return bias_.Dim(); }
955 virtual int32 OutputDim() const { return bias_.Dim(); }
956 using Component::Propagate; // to avoid name hiding
957 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
958 const CuMatrixBase<BaseFloat> &in,
959 CuMatrixBase<BaseFloat> *out) const;
960 virtual void Backprop(const std::string &debug_info,
961 const ComponentPrecomputedIndexes *indexes,
962 const CuMatrixBase<BaseFloat> &, // in_value,
963 const CuMatrixBase<BaseFloat> &, // out_value
964 const CuMatrixBase<BaseFloat> &out_deriv,
965 Component *, // to_update
966 CuMatrixBase<BaseFloat> *in_deriv) const;
967 virtual Component* Copy() const;
968 virtual void Read(std::istream &is, bool binary);
969 virtual void Write(std::ostream &os, bool binary) const;
971 protected:
972 CuVector<BaseFloat> bias_;
973 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
974 };
// NoOpComponent just duplicates its input.  We don't anticipate this being used
// very often, but it may sometimes make your life easier.
class NoOpComponent: public NonlinearComponent {
 public:
  explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
  NoOpComponent() { }
  virtual std::string Type() const { return "NoOpComponent"; }
  virtual int32 Properties() const {
    // in-place propagation is possible since the output equals the input.
    return kSimpleComponent|kLinearInInput|kPropagateInPlace;
  }
  virtual Component* Copy() const { return new NoOpComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
 private:
  NoOpComponent &operator = (const NoOpComponent &other);  // Disallow.
};
// ClipGradientComponent just duplicates its input, but clips gradients
// during backpropagation if they cross a predetermined threshold.
// This component will be used to prevent gradient explosion problem in
// recurrent neural networks.
class ClipGradientComponent: public Component {
 public:
  // Constructor taking all the configuration values and accumulated stats;
  // simply forwards to Init() (see Init() below for the meaning of the
  // arguments, which map one-to-one onto the member variables).
  ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
                        bool norm_based_clipping,
                        BaseFloat self_repair_clipped_proportion_threshold,
                        BaseFloat self_repair_target,
                        BaseFloat self_repair_scale,
                        int32 num_clipped,
                        int32 count,
                        int32 num_self_repaired,
                        int32 num_backpropped) {
    Init(dim, clipping_threshold, norm_based_clipping,
         self_repair_clipped_proportion_threshold,
         self_repair_target,
         self_repair_scale,
         num_clipped, count,
         num_self_repaired, num_backpropped);}

  // Default constructor: sets inert defaults (dim 0, clipping disabled via
  // threshold -1, zeroed stats); use InitFromConfig()/Init()/Read() to
  // really initialize.
  ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
                           norm_based_clipping_(false),
                           self_repair_clipped_proportion_threshold_(1.0),
                           self_repair_target_(0.0),
                           self_repair_scale_(0.0),
                           num_clipped_(0), count_(0),
                           num_self_repaired_(0), num_backpropped_(0) { }

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
            BaseFloat self_repair_clipped_proportion_threshold,
            BaseFloat self_repair_target,
            BaseFloat self_repair_scale,
            int32 num_clipped, int32 count,
            int32 num_self_repaired, int32 num_backpropped);

  virtual std::string Type() const { return "ClipGradientComponent"; }

  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
           kBackpropNeedsInput;
  }

  virtual void ZeroStats();

  virtual Component* Copy() const {
    return new ClipGradientComponent(dim_,
                                     clipping_threshold_,
                                     norm_based_clipping_,
                                     self_repair_clipped_proportion_threshold_,
                                     self_repair_target_,
                                     self_repair_scale_,
                                     num_clipped_,
                                     count_,
                                     num_self_repaired_,
                                     num_backpropped_);}

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
  // Destructor logs a summary of how often self-repair was triggered
  // during this training job (only if it was triggered at least once).
  virtual ~ClipGradientComponent() {
    if (num_self_repaired_ > 0)
      KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
                << ")'s self-repair was activated " << num_self_repaired_
                << " time(s) out of " << num_backpropped_
                << " times of calling Backprop() in this training job.";
  }
 private:
  int32 dim_;  // input/output dimension
  BaseFloat clipping_threshold_;  // threshold to be used for clipping
                                  // could correspond to max-row-norm (if
                                  // norm_based_clipping_ == true) or
                                  // max-absolute-value (otherwise)
  bool norm_based_clipping_;  // if true the max-row-norm will be clipped
                              // else element-wise absolute value clipping is
                              // done

  // some configuration values relating to self-repairing.
  BaseFloat self_repair_clipped_proportion_threshold_;  // the threshold of
                                                        // clipped-proportion
                                                        // for self-repair to be
                                                        // activated
  BaseFloat self_repair_target_;  // the target value towards which self-repair
                                  // is trying to set for in-deriv
  BaseFloat self_repair_scale_;   // constant scaling the self-repair vector
  std::string debug_info_;  // component-node name, used in the destructor to
                            // print out stats of self-repair

  // this function is called from Backprop code, and only does something if the
  // self-repair-scale config value is set and the current clipped proportion
  // exceeds the threshold. What it does is to add a term to in-deriv that
  // forces the input to the ClipGradientComponent to be close to some small
  // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
  // Sigmoid or Tanh or Affine). The hope is that if the input is forced to be
  // small, the parameters on the path will also tend to be small, which may
  // help tamp down the divergence caused by gradient explosion.
  void RepairGradients(const std::string &debug_info,
                       const CuMatrixBase<BaseFloat> &in_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       ClipGradientComponent *to_update) const;

  ClipGradientComponent &operator =
      (const ClipGradientComponent &other);  // Disallow.

 protected:
  // variables to store stats
  // An element corresponds to rows of derivative matrix, when
  // norm_based_clipping_ is true,
  // else it corresponds to each element of the derivative matrix
  // Note: no stats are stored when norm_based_clipping_ is false
  int32 num_clipped_;  // number of elements which were clipped
  int32 count_;  // number of elements which were processed
  int32 num_self_repaired_;  // number of times self-repair is activated
  int32 num_backpropped_;  // number of times backprop is called

};
/** PermuteComponent changes the order of the columns (i.e. the feature or
    activation dimensions).  Output dimension i is mapped to input dimension
    column_map_[i], so it's like doing:
      for each row:
        for each feature/activation dimension i:
          output(row, i) = input(row, column_map_[i]).
*/
class PermuteComponent: public Component {
 public:
  PermuteComponent() {}
  PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }

  virtual int32 InputDim() const { return column_map_.Dim(); }
  virtual int32 OutputDim() const { return column_map_.Dim(); }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(const std::vector<int32> &column_map);

  virtual std::string Type() const { return "PermuteComponent"; }

  virtual int32 Properties() const {
    return kSimpleComponent|kLinearInInput;
  }

  virtual void ZeroStats() {}

  virtual Component* Copy() const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // No trainable parameters, so Scale()/Add() are no-ops.
  virtual void Scale(BaseFloat scale) {}
  virtual void Add(BaseFloat alpha, const Component &other) {}
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
 private:
  // computes the reverse column map.  Must not be called if column_map_.Dim()
  // == 0
  void ComputeReverseColumnMap();
  CuArray<int32> column_map_;
  // the following is a derived variable, not written to disk.
  // It is used in backprop.
  CuArray<int32> reverse_column_map_;
  PermuteComponent &operator =
      (const PermuteComponent &other);  // Disallow.
};
// PerElementScaleComponent scales each dimension of its input with a separate
// trainable scale; it's like a linear component with a diagonal matrix.
class PerElementScaleComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementScaleComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kLinearInInput|
        kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementScaleComponent(const PerElementScaleComponent &other);

  // Random initialization: scales drawn with the given mean and stddev.
  void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
  // Initialization from a Kaldi-format vector read from the given file.
  void Init(std::string vector_filename);

 protected:
  friend class AffineComponent;  // necessary for collapse
  // This function Update() is for extensibility; child classes may override
  // this, e.g. for natural gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv) {
    UpdateSimple(in_value, out_deriv);
  }
  // UpdateSimple is used when *this is a gradient.  Child classes may override
  // this if needed, but typically won't need to.
  virtual void UpdateSimple(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const PerElementScaleComponent &operator
      = (const PerElementScaleComponent &other);  // Disallow.
  // The trainable per-element scales (the component's parameters).
  CuVector<BaseFloat> scales_;
};
// PerElementOffsetComponent offsets each dimension of its input with a separate
// trainable bias; it's like an affine component with fixed weight matrix which
// is always equal to I.
class PerElementOffsetComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return offsets_.Dim(); }
  virtual int32 OutputDim() const { return offsets_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementOffsetComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementOffsetComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|
        kBackpropInPlace|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);

  // Random initialization: offsets drawn with the given mean and stddev.
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev);
  // Initialization from a Kaldi-format vector read from the given file.
  void Init(std::string vector_filename);

 protected:
  const PerElementOffsetComponent &operator
      = (const PerElementOffsetComponent &other);  // Disallow.
  // The trainable per-element offsets (the component's parameters).
  CuVector<BaseFloat> offsets_;
};
// ConstantFunctionComponent returns constant function of its input,
// i.e. its output does not depend on its input.  It is the same as
// an affine component with the linear term fixed at zero.
// It is optionally trainable, and optionally you can use natural
// gradient.  The input is required only because it's more convenient
// to make SimpleComponents [but see ConstantComponent, which requires
// no inputs].
class ConstantFunctionComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_.Dim(); }

  virtual std::string Info() const;
  // possible parameter values with their defaults:
  // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
  // output-mean=0 output-stddev=0
  virtual void InitFromConfig(ConfigLine *cfl);

  // this constructor does not really initialize, use InitFromConfig() or Read().
  ConstantFunctionComponent();

  ConstantFunctionComponent(const ConstantFunctionComponent &other);

  virtual std::string Type() const { return "ConstantFunctionComponent"; }
  virtual int32 Properties() const {
    // updatable-related flags depend on the is-updatable config value, and
    // in-place propagation is only possible when input and output dims match.
    return kSimpleComponent|
        (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
        (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
        kBackpropAdds;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 private:
  int32 input_dim_;
  // the output value-- a vector.
  CuVector<BaseFloat> output_;

  bool is_updatable_;
  // if true, and if updatable, do natural-gradient update.
  bool use_natural_gradient_;
  OnlineNaturalGradient preconditioner_;

  const ConstantFunctionComponent &operator
      = (const ConstantFunctionComponent &other);  // Disallow.
};
// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
// it uses a natural gradient update for the per-element scales, and enforces a
// maximum amount of change per minibatch, for stability.
class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
 public:

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  NaturalGradientPerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const {
    return "NaturalGradientPerElementScaleComponent";
  }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions that are specific to this class:
  explicit NaturalGradientPerElementScaleComponent(
      const NaturalGradientPerElementScaleComponent &other);

  // Random initialization of the scales, plus the natural-gradient and
  // max-change configuration values.
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev, int32 rank, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha,
            BaseFloat max_change_per_minibatch);
  // Initialization of the scales from a Kaldi-format vector file, plus the
  // natural-gradient and max-change configuration values.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha, BaseFloat max_change_per_minibatch);

 private:
  // configuration value for imposing max-change...
  BaseFloat max_change_per_minibatch_;

  // unlike the NaturalGradientAffineComponent, there is only one dimension to
  // consider as the parameters are a vector not a matrix, so we only need one
  // preconditioner.
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  // Override of the parent-class Update() function, called only
  // if this->is_gradient_ = false; this implements the natural
  // gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientPerElementScaleComponent &operator
      = (const NaturalGradientPerElementScaleComponent &other);  // Disallow.
};
1450 /**
1451 * ConvolutionalComponent implements 2d-convolution.
1452 * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1453 * 2 dimensions as it has same size as the input along the 3rd dimension.
1454 * Input : A matrix where each row is a vectorized 3D-tensor.
1455 * The 3D tensor has dimensions
1456 * x: (e.g. time)
1457 * y: (e.g. frequency)
1458 * z: (e.g. channels like features/delta/delta-delta)
1459 *
1460 * The component supports input vectorizations of type zyx and yzx.
1461 * The default vectorization type is zyx.
1462 * e.g. for input vectorization of type zyx the input is vectorized by
1463 * spanning axes z, y and x of the tensor in that order.
1464 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1465 * the zyx vectorized input looks like
1466 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1467 *
1468 *
1469 * Output : The output is also a 3D tensor vectorized in the zyx format.
1470 * The channel axis (z) in the output corresponds to the output of
1471 * different filters. The first channel corresponds to the first filter
1472 * i.e., first row of the filter_params_ matrix.
1473 *
1474 * Note: The component has to support yzx input vectorization as the binaries
1475 * like add-deltas generate yz vectorized output. These input vectors are
1476 * concatenated using the Append descriptor across time steps to form a yzx
1477 * vectorized 3D tensor input.
1478 * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1479 *
1480 *
1481 * For information on the hyperparameters and parameters of this component see
1482 * the variable declarations.
1483 *
1484 * Propagation:
1485 * ------------
1486 * Convolution operation consists of a dot-products between the filter tensor
1487 * and input tensor patch, for various shifts of filter tensor along the x and y
1488 * axes input tensor. (Note: there is no shift along z-axis as the filter and
1489 * input tensor have same size along this axis).
1490 *
1491 * For a particular shift (i,j) of the filter tensor
1492 * along input tensor dimensions x and y, the elements of the input tensor which
1493 * overlap with the filter form the input tensor patch. This patch is vectorized
1494 * in zyx format. All the patches corresponding to various samples in the
1495 * mini-batch are stacked into a matrix, where each row corresponds to one
1496 * patch. Let this matrix be represented by X_{i,j}. The dot products with
1497 * various filters are computed simultaneously by computing the matrix product
1498 * with the filter_params_ matrix (W)
1499 * Y_{i,j} = X_{i,j}*W^T.
1500 * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1501 *
1502 * All the matrix products corresponding to various shifts (i,j) of the
1503 * filter tensor are computed simultaneously using the AddMatMatBatched
1504 * call of CuMatrixBase class.
1505 *
1506 * BackPropagation:
1507 * ----------------
1508 * Backpropagation to compute the input derivative (\nabla X_{i,j})
 * consists of a series of matrix products.
1510 * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the
1511 * output derivative for a particular shift of the filter.
1512 *
1513 * Once again these matrix products are computed simultaneously.
1514 *
1515 * Update:
1516 * -------
1517 * The weight gradient is computed as
1518 * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j})
1519 *
1520 */
1521 class ConvolutionComponent: public UpdatableComponent {
1522 public:
1523 enum TensorVectorizationType {
1524 kYzx = 0,
1525 kZyx = 1
1526 };
1528 ConvolutionComponent();
1529 // constructor using another component
1530 ConvolutionComponent(const ConvolutionComponent &component);
1531 // constructor using parameters
1532 ConvolutionComponent(
1533 const CuMatrixBase<BaseFloat> &filter_params,
1534 const CuVectorBase<BaseFloat> &bias_params,
1535 int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1536 int32 filt_x_dim, int32 filt_y_dim,
1537 int32 filt_x_step, int32 filt_y_step,
1538 TensorVectorizationType input_vectorization,
1539 BaseFloat learning_rate);
1541 virtual int32 InputDim() const;
1542 virtual int32 OutputDim() const;
1544 virtual std::string Info() const;
1545 virtual void InitFromConfig(ConfigLine *cfl);
1546 virtual std::string Type() const { return "ConvolutionComponent"; }
1547 virtual int32 Properties() const {
1548 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
1549 kBackpropAdds|kPropagateAdds;
1550 }
1552 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1553 const CuMatrixBase<BaseFloat> &in,
1554 CuMatrixBase<BaseFloat> *out) const;
1555 virtual void Backprop(const std::string &debug_info,
1556 const ComponentPrecomputedIndexes *indexes,
1557 const CuMatrixBase<BaseFloat> &in_value,
1558 const CuMatrixBase<BaseFloat> &, // out_value,
1559 const CuMatrixBase<BaseFloat> &out_deriv,
1560 Component *to_update_in,
1561 CuMatrixBase<BaseFloat> *in_deriv) const;
1562 void Update(const std::string &debug_info,
1563 const CuMatrixBase<BaseFloat> &in_value,
1564 const CuMatrixBase<BaseFloat> &out_deriv,
1565 const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);
1568 virtual void Read(std::istream &is, bool binary);
1569 virtual void Write(std::ostream &os, bool binary) const;
1571 virtual Component* Copy() const;
1573 // Some functions from base-class UpdatableComponent.
1574 virtual void Scale(BaseFloat scale);
1575 virtual void Add(BaseFloat alpha, const Component &other);
1576 virtual void PerturbParams(BaseFloat stddev);
1577 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1578 virtual int32 NumParameters() const;
1579 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1580 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
1582 // Some functions that are specific to this class.
1583 void SetParams(const VectorBase<BaseFloat> &bias,
1584 const MatrixBase<BaseFloat> &filter);
1585 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
1586 const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
1587 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1588 int32 filt_x_dim, int32 filt_y_dim,
1589 int32 filt_x_step, int32 filt_y_step, int32 num_filters,
1590 TensorVectorizationType input_vectorization,
1591 BaseFloat param_stddev, BaseFloat bias_stddev);
1592 // there is no filt_z_dim parameter as the length of the filter along
1593 // z-dimension is same as the input
1594 void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
1595 int32 filt_x_dim, int32 filt_y_dim,
1596 int32 filt_x_step, int32 filt_y_step,
1597 TensorVectorizationType input_vectorization,
1598 std::string matrix_filename);
1600 // resize the component, setting the parameters to zero, while
1601 // leaving any other configuration values the same
1602 void Resize(int32 input_dim, int32 output_dim);
1604 void Update(const std::string &debug_info,
1605 const CuMatrixBase<BaseFloat> &in_value,
1606 const CuMatrixBase<BaseFloat> &out_deriv);
1609 private:
1610 int32 input_x_dim_; // size of the input along x-axis
1611 // (e.g. number of time steps)
1613 int32 input_y_dim_; // size of input along y-axis
1614 // (e.g. number of mel-frequency bins)
1616 int32 input_z_dim_; // size of input along z-axis
1617 // (e.g. number of channels is 3 if the input has
1618 // features + delta + delta-delta features
1620 int32 filt_x_dim_; // size of the filter along x-axis
1622 int32 filt_y_dim_; // size of the filter along y-axis
1624 // there is no filt_z_dim_ as it is always assumed to be
1625 // the same as input_z_dim_
1627 int32 filt_x_step_; // the number of steps taken along x-axis of input
1628 // before computing the next dot-product
1629 // of filter and input
1631 int32 filt_y_step_; // the number of steps taken along y-axis of input
1632 // before computing the next dot-product of the filter
1633 // and input
1635 // there is no filt_z_step_ as only dot product is possible along this axis
1637 TensorVectorizationType input_vectorization_; // type of vectorization of the
1638 // input 3D tensor. Accepts zyx and yzx formats
1640 CuMatrix<BaseFloat> filter_params_;
1641 // the filter (or kernel) matrix is a matrix of vectorized 3D filters
1642 // where each row in the matrix corresponds to one filter.
1643 // The 3D filter tensor is vectorizedin zyx format.
1644 // The first row of the matrix corresponds to the first filter and so on.
1645 // Keep in mind the vectorization type and order of filters when using file
1646 // based initialization.
1648 CuVector<BaseFloat> bias_params_;
1649 // the filter-specific bias vector (i.e., there is a seperate bias added
1650 // to the output of each filter).
1651 bool is_gradient_;
1653 void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
1654 CuMatrix<BaseFloat> *patches) const;
1655 void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
1656 CuMatrixBase<BaseFloat> *in_deriv) const;
1657 const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
1658 };
1661 // LstmNonlinearityComponent is a component that implements part of an LSTM, by
1662 // combining together the sigmoids and tanh's, plus some diagonal terms, into
1663 // a single block.
1664 // We will refer to the LSTM formulation used in
1665 //
1666 // "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
1667 // by H. Sak et al,
1668 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1669 //
1670 // Suppose the cell dimension is C. Then outside this component, we compute
1671 // the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1672 // matrix multiplication:
1673 //
1674 // i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1675 // f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1676 // c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
1677 // o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
1678 //
1679 // The part of the computation that takes place in this component is as follows.
1680 // Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and
1681 // c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1682 //
1683 // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1684 //
1685 //
1686 // This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f
1687 // and w_o.
1688 //
1689 //
1690 // In the forward pass (Propagate), this component computes the following:
1691 //
1692 // i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
1693 // f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
1694 // c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
1695 // o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
1696 // m_t = o_t * Tanh(c_t) (5)
1697 // # note: the outputs are just c_t and m_t.
1698 //
1699 // The backprop is as you would think, but for the "self-repair" we need to pass
1700 // in additional vectors (of the same dim as the parameters of the layer) that
1701 // dictate whether or not we add an additional term to the backpropagated
1702 // derivatives. (This term helps force the input to the nonlinearities into the
1703 // range where the derivatives are not too small).
1704 //
1705 // This component stores stats of the same form as are normally stored by the
1706 // StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1707 // activations and derivatives, but this is done inside the Backprop() functions.
1708 // [the StoreStats() functions don't take the input data as an argument, so
1709 // storing this data that way is impossible, and anyway it's more efficient to
1710 // do it as part of backprop.]
1711 class LstmNonlinearityComponent: public UpdatableComponent {
1712 public:
1714 virtual int32 InputDim() const;
1715 virtual int32 OutputDim() const;
1716 virtual std::string Info() const;
1717 virtual void InitFromConfig(ConfigLine *cfl);
1718 LstmNonlinearityComponent() { } // use Init to really initialize.
1719 virtual std::string Type() const { return "LstmNonlinearityComponent"; }
1720 virtual int32 Properties() const {
1721 return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
1722 }
1724 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1725 const CuMatrixBase<BaseFloat> &in,
1726 CuMatrixBase<BaseFloat> *out) const;
1727 virtual void Backprop(const std::string &debug_info,
1728 const ComponentPrecomputedIndexes *indexes,
1729 const CuMatrixBase<BaseFloat> &in_value,
1730 const CuMatrixBase<BaseFloat> &, // out_value,
1731 const CuMatrixBase<BaseFloat> &out_deriv,
1732 Component *to_update_in,
1733 CuMatrixBase<BaseFloat> *in_deriv) const;
1735 virtual void Read(std::istream &is, bool binary);
1736 virtual void Write(std::ostream &os, bool binary) const;
1738 virtual Component* Copy() const;
1740 // Some functions from base-class UpdatableComponent.
1741 virtual void Scale(BaseFloat scale);
1742 virtual void Add(BaseFloat alpha, const Component &other);
1743 virtual void PerturbParams(BaseFloat stddev);
1744 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
1745 virtual int32 NumParameters() const;
1746 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
1747 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
1748 virtual void ZeroStats();
1750 // Some functions that are specific to this class:
1751 explicit LstmNonlinearityComponent(
1752 const LstmNonlinearityComponent &other);
1754 void Init(int32 cell_dim, BaseFloat param_stddev,
1755 BaseFloat tanh_self_repair_threshold,
1756 BaseFloat sigmoid_self_repair_threshold,
1757 BaseFloat self_repair_scale);
1759 void Init(std::string vector_filename,
1760 int32 rank, int32 update_period, BaseFloat num_samples_history,
1761 BaseFloat alpha, BaseFloat max_change_per_minibatch);
1763 private:
1765 // Initializes the natural-gradient object with the configuration we
1766 // use for this object, which for now is hardcoded at the C++ level.
1767 void InitNaturalGradient();
1770 // Notation: C is the cell dimension; it equals params_.NumCols().
1772 // The dimension of the parameter matrix is (3 x C);
1773 // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
1774 CuMatrix<BaseFloat> params_;
1776 // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1777 // equations (1) through (5), this is the sum of the values of the nonliearities
1778 // (used for diagnostics only). It is comparable to value_sum_ vector
1779 // in base-class NonlinearComponent.
1780 CuMatrix<double> value_sum_;
1782 // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
1783 // equations (1) through (5), this is the sum of the derivatives of the
1784 // nonliearities (used for diagnostics and to control self-repair). It is
1785 // comparable to the deriv_sum_ vector in base-class
1786 // NonlinearComponent.
1787 CuMatrix<double> deriv_sum_;
1789 // This matrix has dimension 10. The contents are a block of 5 self-repair
1790 // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
1791 // self-repair scales (typically all 0.00001). These are for each of the 5
1792 // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
1793 // more info).
1794 CuVector<BaseFloat> self_repair_config_;
1796 // This matrix has dimension 5. For each of the 5 nonlinearities in the LSTM
1797 // component (see comments in cu-math.h for more info), it contains the total,
1798 // over all frames represented in count_, of the number of dimensions that
1799 // were subject to self_repair. To get the self-repair proportion you should
1800 // divide by (count_ times cell_dim_).
1801 CuVector<double> self_repair_total_;
1803 // The total count (number of frames) corresponding to the stats in value_sum_
1804 // and deriv_sum_.
1805 double count_;
1807 // Preconditioner for the parameters of this component [operates in the space
1808 // of dimension C].
1809 // The preconditioner stores its own configuration values; we write and read
1810 // these, but not the preconditioner object itself.
1811 OnlineNaturalGradient preconditioner_;
1813 const LstmNonlinearityComponent &operator
1814 = (const LstmNonlinearityComponent &other); // Disallow.
1815 };
1820 /*
1821 * MaxPoolingComponent :
1822 * Maxpooling component was first used in ConvNet for selecting a
1823 * representative activation in an area. It inspired Maxout nonlinearity.
1824 * Each output element of this component is the maximum of a block of
1825 * input elements where the block has a 3D dimension (pool_x_size_,
1826 * pool_y_size_, pool_z_size_).
1827 * Blocks could overlap if the shift value on any axis is smaller
1828 * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
1829 * If the shift values are equal to their pool size, there is no
1830 * overlap; while if they all equal 1, the blocks overlap to
1831 * the greatest possible extent.
1832 *
1833 * This component is designed to be used after a ConvolutionComponent
1834 * so that the input matrix is propagated from a 2d-convolutional layer.
1835 * This component implements 3d-maxpooling which performs
1836 * max pooling along the three axes.
1837 * Input : A matrix where each row is a vectorized 3D-tensor.
1838 * The 3D tensor has dimensions
1839 * x: (e.g. time)
1840 * y: (e.g. frequency)
1841 * z: (e.g. channels like number of filters in the ConvolutionComponent)
1842 *
1843 * The component assumes input vectorizations of type zyx
1844 * which is the default output vectorization type of a ConvolutionComponent.
1845 * e.g. for input vectorization of type zyx the input is vectorized by
1846 * spanning axes z, y and x of the tensor in that order.
1847 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1848 * the zyx vectorized input looks like
1849 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1850 *
1851 * Output : The output is also a 3D tensor vectorized in the zyx format.
1852 *
1853 * For information on the hyperparameters and parameters of this component see
1854 * the variable declarations.
1855 *
1856 *
1857 */
class MaxpoolingComponent: public Component {
 public:
  // Default constructor: leaves the component unconfigured (all dims and
  // pool sizes/steps zero); use InitFromConfig() to really initialize.
  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
  // constructor using another component
  MaxpoolingComponent(const MaxpoolingComponent &component);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  // Consistency check of the configuration values.  NOTE(review): presumably
  // verifies that the dims and pool sizes/steps are compatible -- confirm
  // against the .cc implementation.
  virtual void Check() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
        kBackpropAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }

  // Converts the input matrix (one row per example) to a matrix of
  // vectorized pooling-window patches, one patch per (pool position,
  // example) pair.
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Scatters per-patch input derivatives back into the full input-derivative
  // matrix (the reverse mapping of InputToInputPatches).
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;

 protected:
  int32 input_x_dim_;   // size of the input along x-axis
                        // (e.g. number of time steps)
  int32 input_y_dim_;   // size of input along y-axis
                        // (e.g. number of mel-frequency bins)
  int32 input_z_dim_;   // size of input along z-axis
                        // (e.g. number of filters in the ConvolutionComponent)

  int32 pool_x_size_;   // size of the pooling window along x-axis
  int32 pool_y_size_;   // size of the pooling window along y-axis
  int32 pool_z_size_;   // size of the pooling window along z-axis

  int32 pool_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next pool
  int32 pool_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next pool
  int32 pool_z_step_;   // the number of steps taken along z-axis of input
                        // before computing the next pool

};
1925 /**
1926 CompositeComponent is a component representing a sequence of
1927 [simple] components. The config line would be something like the following
1928 (imagine this is all on one line):
1930 component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
1931 component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
1932 component2='type=RectifiedLinearComponent dim=10000' \
1933 component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
1935 The reason you might want to use this component, instead of directly using
1936 the same sequence of components in the config file, is to save GPU memory (at
1937 the expense of more compute)-- because doing it like this means we have to
1938 re-do parts of the forward pass in the backprop phase, but we avoid using
1939 much memory for very long (and you can make the memory usage very small by
1940 making max-rows-process small). We inherit from UpdatableComponent just in
1941 case one or more of the components in the sequence are updatable.
1943 It is an error to nest a CompositeComponent inside a CompositeComponent.
1944 The same effect can be accomplished by specifying a smaller max-rows-process
1945 in a single CompositeComponent.
1946 */
1947 class CompositeComponent: public UpdatableComponent {
1948 public:
1949 virtual int32 InputDim() const;
1950 virtual int32 OutputDim() const;
1952 virtual std::string Info() const;
1954 virtual void InitFromConfig(ConfigLine *cfl);
1956 virtual Component* Copy() const;
1958 CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.
1960 // Initialize from this list of components; takes ownership of the pointers.
1961 void Init(const std::vector<Component*> &components,
1962 int32 max_rows_process);
1964 virtual std::string Type() const { return "CompositeComponent"; }
1966 // The properties depend on the properties of the constituent components. As
1967 // a special case, we never return kStoresStats in the properties: by default
1968 // we store things like activation stats (e.g. for nonlinear components like
1969 // ReLU) as part of the backprop. This means we may wastefully store stats
1970 // even when not requested, but it does save time as a separate StoreStats()
1971 // call would involve propagating the internals.
1972 virtual int32 Properties() const;
1974 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1975 const CuMatrixBase<BaseFloat> &in,
1976 CuMatrixBase<BaseFloat> *out) const;
1977 virtual void Backprop(const std::string &debug_info,
1978 const ComponentPrecomputedIndexes *indexes,
1979 const CuMatrixBase<BaseFloat> &in_value,
1980 const CuMatrixBase<BaseFloat> &, // out_value
1981 const CuMatrixBase<BaseFloat> &out_deriv,
1982 Component *to_update,
1983 CuMatrixBase<BaseFloat> *in_deriv) const;
1985 // note, we don't implement StoreStats() as it would be inefficient. Instead,
1986 // by default we call StoreStats() on all members that have the flag set,
1987 // inside the Backprop.
1988 virtual void ZeroStats();
1990 virtual void Read(std::istream &is, bool binary);
1991 virtual void Write(std::ostream &os, bool binary) const;
1993 // Don't implement Copy() at this level: implement it in the child class.
1995 // Some functions from base-class UpdatableComponent.
1996 virtual void SetUnderlyingLearningRate(BaseFloat lrate);
1997 virtual void SetActualLearningRate(BaseFloat lrate);
1998 virtual void SetAsGradient();
1999 virtual void Scale(BaseFloat scale);
2000 virtual void Add(BaseFloat alpha, const Component &other);
2001 virtual void PerturbParams(BaseFloat stddev);
2002 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
2003 virtual int32 NumParameters() const;
2004 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
2005 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
2007 // note: we dont implement the StoreStats function as it would be quite
2008 // expensive; instead, by default we call StoreStats() for any components that
2009 // want to store stats, as part of the backprop pass. This is not 100% ideal
2010 // but it will usually do what you want. We can revisit this later if needed.
2012 // Functions to iterate over the internal components
2014 int32 NumComponents() const { return components_.size();}
2015 /// Gets the ith component in this component.
2016 /// The ordering is the same as in the config line. The caller
2017 /// does not own the received component.
2018 const Component* GetComponent(int32 i) const;
2019 /// Sets the ith component. After this call, CompositeComponent owns
2020 /// the reference to the argument component. Frees the previous
2021 /// ith component.
2022 void SetComponent(int32 i, Component *component);
2024 virtual ~CompositeComponent() { DeletePointers(&components_); }
2025 private:
2026 // returns the stride type, kDefaultStride or kStrideEqualNumCols,
2027 // at the output of the i'th component.
2028 inline MatrixStrideType GetStrideType(int32 i) const;
2030 // returns true if at least one of 'components_' returns the kUpdatable flag
2031 // in its flags.
2032 bool IsUpdatable() const;
2034 // the maximum number of
2035 int32 max_rows_process_;
2036 std::vector<Component*> components_;
2038 };
2041 } // namespace nnet3
2042 } // namespace kaldi
2045 #endif