// nnet3/nnet-simple-component.h

// Copyright 2011-2013  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)
//                2013  Xiaohui Zhang
//           2014-2015  Vijayaditya Peddinti
//           2014-2015  Guoguo Chen
//                2015  Daniel Galvez
//                2015  Tom Ko

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
26 #ifndef KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
27 #define KALDI_NNET3_NNET_SIMPLE_COMPONENT_H_
29 #include "nnet3/nnet-common.h"
30 #include "nnet3/nnet-component-itf.h"
31 #include "nnet3/natural-gradient-online.h"
32 #include <iostream>
34 namespace kaldi {
35 namespace nnet3 {
/// @file  nnet-simple-component.h
///   This file contains declarations of components that are "simple", meaning
///   they don't care about the indexes they are operating on, produce one
///   output for one input, and return the kSimpleComponent flag in their
///   Properties(): for example, tanh and affine components.  In
///   nnet-general-component.h there are components that don't fit this pattern.
44 // This "nnet3" version of the p-norm component only supports the 2-norm.
45 class PnormComponent: public Component {
46 public:
47 void Init(int32 input_dim, int32 output_dim);
48 explicit PnormComponent(int32 input_dim, int32 output_dim) {
49 Init(input_dim, output_dim);
50 }
51 virtual int32 Properties() const {
52 return kSimpleComponent|kLinearInInput|kBackpropNeedsInput|kBackpropNeedsOutput;
53 }
54 PnormComponent(): input_dim_(0), output_dim_(0) { }
55 virtual std::string Type() const { return "PnormComponent"; }
56 virtual void InitFromConfig(ConfigLine *cfl);
57 virtual int32 InputDim() const { return input_dim_; }
58 virtual int32 OutputDim() const { return output_dim_; }
59 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
60 const CuMatrixBase<BaseFloat> &in,
61 CuMatrixBase<BaseFloat> *out) const;
62 virtual void Backprop(const std::string &debug_info,
63 const ComponentPrecomputedIndexes *indexes,
64 const CuMatrixBase<BaseFloat> &in_value,
65 const CuMatrixBase<BaseFloat> &out_value,
66 const CuMatrixBase<BaseFloat> &out_deriv,
67 Component *to_update,
68 CuMatrixBase<BaseFloat> *in_deriv) const;
69 virtual Component* Copy() const { return new PnormComponent(input_dim_,
70 output_dim_); }
72 virtual void Read(std::istream &is, bool binary); // This Read function
73 // requires that the Component has the correct type.
75 /// Write component to stream
76 virtual void Write(std::ostream &os, bool binary) const;
78 protected:
79 int32 input_dim_;
80 int32 output_dim_;
81 };
83 // This component randomly zeros dropout_proportion of the input
84 // and the derivatives are backpropagated through the nonzero inputs.
85 // Typically this component used during training but not in test time.
86 // The idea is described under the name Dropout, in the paper
87 // "Dropout: A Simple Way to Prevent Neural Networks from Overfitting".
88 class DropoutComponent : public RandomComponent {
89 public:
90 void Init(int32 dim, BaseFloat dropout_proportion = 0.0,
91 bool dropout_per_frame = false);
93 DropoutComponent(int32 dim, BaseFloat dropout = 0.0,
94 bool dropout_per_frame = false) {
95 Init(dim, dropout, dropout_per_frame);
96 }
98 DropoutComponent(): dim_(0), dropout_proportion_(0.0),
99 dropout_per_frame_(false) { }
101 virtual int32 Properties() const {
102 return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
103 }
104 virtual std::string Type() const { return "DropoutComponent"; }
106 virtual void InitFromConfig(ConfigLine *cfl);
108 virtual int32 InputDim() const { return dim_; }
110 virtual int32 OutputDim() const { return dim_; }
112 virtual void Read(std::istream &is, bool binary);
114 // Write component to stream
115 virtual void Write(std::ostream &os, bool binary) const;
117 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
118 const CuMatrixBase<BaseFloat> &in,
119 CuMatrixBase<BaseFloat> *out) const;
120 virtual void Backprop(const std::string &debug_info,
121 const ComponentPrecomputedIndexes *indexes,
122 const CuMatrixBase<BaseFloat> &in_value,
123 const CuMatrixBase<BaseFloat> &out_value,
124 const CuMatrixBase<BaseFloat> &out_deriv,
125 Component *to_update,
126 CuMatrixBase<BaseFloat> *in_deriv) const;
127 virtual Component* Copy() const { return new DropoutComponent(dim_,
128 dropout_proportion_,
129 dropout_per_frame_); }
130 virtual std::string Info() const;
132 void SetDropoutProportion(BaseFloat dropout_proportion) {
133 dropout_proportion_ = dropout_proportion;
134 }
136 private:
137 int32 dim_;
138 /// dropout-proportion is the proportion that is dropped out,
139 /// e.g. if 0.1, we set 10% to zero value.
140 BaseFloat dropout_proportion_;
141 bool dropout_per_frame_;
142 };
144 class ElementwiseProductComponent: public Component {
145 public:
146 void Init(int32 input_dim, int32 output_dim);
147 explicit ElementwiseProductComponent(int32 input_dim, int32 output_dim) {
148 Init(input_dim, output_dim);
149 }
150 virtual int32 Properties() const {
151 return kSimpleComponent|kBackpropNeedsInput;
152 }
153 ElementwiseProductComponent(): input_dim_(0), output_dim_(0) { }
154 virtual std::string Type() const { return "ElementwiseProductComponent"; }
155 virtual void InitFromConfig(ConfigLine *cfl);
156 virtual int32 InputDim() const { return input_dim_; }
157 virtual int32 OutputDim() const { return output_dim_; }
158 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
159 const CuMatrixBase<BaseFloat> &in,
160 CuMatrixBase<BaseFloat> *out) const;
161 virtual void Backprop(const std::string &debug_info,
162 const ComponentPrecomputedIndexes *indexes,
163 const CuMatrixBase<BaseFloat> &in_value,
164 const CuMatrixBase<BaseFloat> &out_value,
165 const CuMatrixBase<BaseFloat> &out_deriv,
166 Component *to_update,
167 CuMatrixBase<BaseFloat> *in_deriv) const;
168 virtual Component* Copy() const { return new ElementwiseProductComponent(input_dim_,
169 output_dim_); }
171 virtual void Read(std::istream &is, bool binary); // This Read function
172 // requires that the Component has the correct type.
174 /// Write component to stream
175 virtual void Write(std::ostream &os, bool binary) const;
177 protected:
178 int32 input_dim_;
179 int32 output_dim_;
180 };
182 class NormalizeComponent: public Component {
183 public:
184 void Init(int32 input_dim, BaseFloat target_rms, bool add_log_stddev);
185 explicit NormalizeComponent(int32 input_dim,
186 BaseFloat target_rms = 1.0,
187 bool add_log_stddev = false) {
188 Init(input_dim, target_rms, add_log_stddev);
189 }
190 explicit NormalizeComponent(const NormalizeComponent &other);
191 // note: there is some special code in NonlinerComponent::Info() that
192 // specifically caters to this class.
193 virtual int32 Properties() const {
194 return (add_log_stddev_ ?
195 kSimpleComponent|kBackpropNeedsInput|kBackpropAdds :
196 kSimpleComponent|kBackpropNeedsInput|kPropagateInPlace|
197 kBackpropAdds|kBackpropInPlace);
198 }
199 NormalizeComponent(): target_rms_(1.0), add_log_stddev_(false) { }
200 virtual std::string Type() const { return "NormalizeComponent"; }
201 virtual void InitFromConfig(ConfigLine *cfl);
202 virtual Component* Copy() const { return new NormalizeComponent(*this); }
203 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
204 const CuMatrixBase<BaseFloat> &in,
205 CuMatrixBase<BaseFloat> *out) const;
206 virtual void Backprop(const std::string &debug_info,
207 const ComponentPrecomputedIndexes *indexes,
208 const CuMatrixBase<BaseFloat> &in_value,
209 const CuMatrixBase<BaseFloat> &, // out_value
210 const CuMatrixBase<BaseFloat> &out_deriv,
211 Component *to_update,
212 CuMatrixBase<BaseFloat> *in_deriv) const;
214 virtual void Read(std::istream &is, bool binary);
215 virtual void Write(std::ostream &os, bool binary) const;
216 virtual int32 InputDim() const { return input_dim_; }
217 virtual int32 OutputDim() const {
218 return (input_dim_ + (add_log_stddev_ ? 1 : 0));
219 }
220 virtual std::string Info() const;
221 private:
222 NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
223 enum { kExpSquaredNormFloor = -66 };
224 static const BaseFloat kSquaredNormFloor;
225 int32 input_dim_;
226 BaseFloat target_rms_; // The target rms for outputs.
227 // about 0.7e-20. We need a value that's exactly representable in
228 // float and whose inverse square root is also exactly representable
229 // in float (hence, an even power of two).
231 bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
232 // is an extra dimension of the output.
233 };
236 /*
237 Implements the sigmoid nonlinearity, i.e. the function y = exp(-x).
239 Configuration values accepted:
240 dim Dimension of this component, e.g. 1024
242 Configuration values inherited from NonlinearComponent, and their
243 local meanings:
244 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.05. This
245 controls the self-repair mechanism, which for sigmoid units
246 consists of identifying units which are oversaturated (i.e.
247 usually close to -1 or +1) and nudging the inputs to be
248 closer to zero. It gates on the average derivative of the
249 nonlinearity, which for sigmoid is a value between 0 and
250 0.25. For units where the average function-derivative
251 accumulated during this iteration (job) of training is less
252 than this threshold, we activate self-repair, which consists
253 of adding (-self-repair-scale * (2*the output of the
254 nonlinearity - 1.0)) to the backpropagated derivatives.
255 This just happens to be a convenient-to-compute function
256 that's +1 for large negative inputs, and -1 for large positive
257 inputs, and smooth in between.
258 The default value of this is -1000, which the code internally
259 maps to 0.05 which is suitable for sigmoid units; if you do set it,
260 you can set it to a value like 0.025 or 0.075.
261 self-repair-scale Scale for the self-repair mechanism; see comments above.
262 default=0, but we usually set this to 1.0e-05 (or
263 occasionally 1.0e-04) in the scripts.
265 */
266 class SigmoidComponent: public NonlinearComponent {
267 public:
268 explicit SigmoidComponent(const SigmoidComponent &other): NonlinearComponent(other) { }
269 SigmoidComponent() { }
270 virtual std::string Type() const { return "SigmoidComponent"; }
271 virtual int32 Properties() const {
272 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
273 }
274 virtual Component* Copy() const { return new SigmoidComponent(*this); }
275 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
276 const CuMatrixBase<BaseFloat> &in,
277 CuMatrixBase<BaseFloat> *out) const;
278 virtual void Backprop(const std::string &debug_info,
279 const ComponentPrecomputedIndexes *indexes,
280 const CuMatrixBase<BaseFloat> &, //in_value
281 const CuMatrixBase<BaseFloat> &out_value,
282 const CuMatrixBase<BaseFloat> &out_deriv,
283 Component *to_update,
284 CuMatrixBase<BaseFloat> *in_deriv) const;
285 virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
286 private:
287 // this function is called from Backprop code and only does something if the
288 // self-repair-scale config value is set.
289 void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
290 CuMatrixBase<BaseFloat> *in_deriv,
291 SigmoidComponent *to_update) const;
293 SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow.
294 };
296 /*
297 Implements the tanh nonlinearity, i.e. the function y = tanh(x).
299 Configuration values accepted:
300 dim Dimension of this component, e.g. 1024
302 Configuration values inherited from NonlinearComponent, and their
303 local meanings:
304 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.2. This
305 controls the self-repair mechanism, which for tanh units
306 consists of identifying units which are oversaturated (i.e.
307 usually close to -1 or +1) and nudging the inputs to be
308 closer to zero. It gates on the average derivative of
309 the nonlinearity, which for tanh is a value between 0 and 1.
310 For units where the average function-derivative accumulated
311 during this iteration (job) of training is less than
312 this threshold, we activate self-repair, which consists of
313 adding (-self-repair-scale * the output of the nonlinearity),
314 i.e. (-self-repair-scale * tanh(x)) to the backpropagated
315 derivatives.
316 The default value of this is -1000, which the code internally
317 maps to 0.2 which is suitable for tanh units; if you do set it,
318 you can set it to a value like 0.1 or 0.3.
319 self-repair-scale Scale for the self-repair mechanism; see comments above.
320 default=0, but we usually set this to 1.0e-05 (or
321 occasionally 1.0e-04) in the scripts.
322 */
323 class TanhComponent: public NonlinearComponent {
324 public:
325 explicit TanhComponent(const TanhComponent &other): NonlinearComponent(other) { }
326 TanhComponent() { }
327 virtual std::string Type() const { return "TanhComponent"; }
328 virtual Component* Copy() const { return new TanhComponent(*this); }
329 virtual int32 Properties() const {
330 return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|kStoresStats;
331 }
332 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
333 const CuMatrixBase<BaseFloat> &in,
334 CuMatrixBase<BaseFloat> *out) const;
335 virtual void Backprop(const std::string &debug_info,
336 const ComponentPrecomputedIndexes *indexes,
337 const CuMatrixBase<BaseFloat> &, //in_value
338 const CuMatrixBase<BaseFloat> &out_value,
339 const CuMatrixBase<BaseFloat> &out_deriv,
340 Component *to_update,
341 CuMatrixBase<BaseFloat> *in_deriv) const;
342 virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
343 private:
344 // this function is called from Backprop code and only does something if the
345 // self-repair-scale config value is set.
346 void RepairGradients(const CuMatrixBase<BaseFloat> &out_value,
347 CuMatrixBase<BaseFloat> *in_deriv,
348 TanhComponent *to_update) const;
350 TanhComponent &operator = (const TanhComponent &other); // Disallow.
351 };
354 /*
355 Implements the Rectified Linear Unit nonlinearity, a.k.a. ReLU.
357 Configuration values accepted:
358 dim Dimension of this component, e.g. 1024
360 Configuration values inherited from NonlinearComponent, and their
361 local meanings:
362 self-repair-lower-threshold e.g. self-repair-lower-threshold=0.05. (Lower
363 threshold for self-repair, if set; in this case acts on
364 the average function-derivative, which is the proportion
365 of the time the output is > 0. For any unit where the
366 average function-derivative is lower than this threshold,
367 we add 'self-repair-scale' to the backpropagated
368 derivatives in backprop. There is no default
369 (default=-1000, which is interpreted specially).
370 self-repair-upper-threshold e.g. self-repair-upper-threshold=0.95.
371 Like self-repair-lower-threshold, but controls self-repair
372 for units that are active *too* much of the time. Units
373 whose average function-derivative exceeds this threshold
374 will have the negative of 'self-repair-scale' added to their
375 input derivatives in backprop. There is no default
376 (default=-1000, which is interpreted specially).
377 self-repair-scale Scale for the self-repair mechanism; see comments for
378 self-repair-lower-threshold and self-repair-upper-threshold
379 for details. default=0, but we usually set this to 1.0e-05
380 (or occasionally 1.0e-04) in the scripts.
381 */
382 class RectifiedLinearComponent: public NonlinearComponent {
383 public:
384 explicit RectifiedLinearComponent(const RectifiedLinearComponent &other):
385 NonlinearComponent(other) { }
386 RectifiedLinearComponent() { }
387 virtual std::string Type() const { return "RectifiedLinearComponent"; }
388 virtual Component* Copy() const { return new RectifiedLinearComponent(*this); }
389 virtual int32 Properties() const {
390 return kSimpleComponent|kLinearInInput|kBackpropNeedsOutput|kPropagateInPlace|
391 kStoresStats;
392 }
393 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
394 const CuMatrixBase<BaseFloat> &in,
395 CuMatrixBase<BaseFloat> *out) const;
396 virtual void Backprop(const std::string &debug_info,
397 const ComponentPrecomputedIndexes *indexes,
398 const CuMatrixBase<BaseFloat> &, //in_value
399 const CuMatrixBase<BaseFloat> &out_value,
400 const CuMatrixBase<BaseFloat> &out_deriv,
401 Component *to_update,
402 CuMatrixBase<BaseFloat> *in_deriv) const;
403 virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);
405 private:
406 // this function is called from Backprop code and only does something if the
407 // self-repair-scale config value is set.
408 void RepairGradients(CuMatrixBase<BaseFloat> *in_deriv,
409 RectifiedLinearComponent *to_update) const;
411 RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow.
412 };
414 /**
415 This component is a fixed (non-trainable) nonlinearity that sums its inputs
416 to produce outputs. Currently the only supported configuration is that its
417 input-dim is interpreted as consisting of n blocks, and the output is just a
418 summation over the n blocks, where n = input-dim / output-dim, so for instance
419 output[n] = input[n] + input[block-size + n] + .... .
420 Later if needed we can add a configuration variable that allows you to sum
421 over 'interleaved' input.
422 */
423 class SumReduceComponent: public Component {
424 public:
425 void Init(int32 input_dim, int32 output_dim);
426 explicit SumReduceComponent(int32 input_dim, int32 output_dim) {
427 Init(input_dim, output_dim);
428 }
429 virtual int32 Properties() const {
430 return kSimpleComponent|kLinearInInput;
431 }
432 SumReduceComponent(): input_dim_(0), output_dim_(0) { }
433 virtual std::string Type() const { return "SumReduceComponent"; }
434 virtual void InitFromConfig(ConfigLine *cfl);
435 virtual int32 InputDim() const { return input_dim_; }
436 virtual int32 OutputDim() const { return output_dim_; }
437 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
438 const CuMatrixBase<BaseFloat> &in,
439 CuMatrixBase<BaseFloat> *out) const;
440 virtual void Backprop(const std::string &debug_info,
441 const ComponentPrecomputedIndexes *indexes,
442 const CuMatrixBase<BaseFloat> &, // in_value
443 const CuMatrixBase<BaseFloat> &, // out_value,
444 const CuMatrixBase<BaseFloat> &out_deriv,
445 Component *, // to_update
446 CuMatrixBase<BaseFloat> *in_deriv) const;
447 virtual Component* Copy() const { return new SumReduceComponent(input_dim_,
448 output_dim_); }
450 virtual void Read(std::istream &is, bool binary); // This Read function
451 // requires that the Component has the correct type.
453 /// Write component to stream
454 virtual void Write(std::ostream &os, bool binary) const;
456 protected:
457 int32 input_dim_;
458 int32 output_dim_;
459 };
// Forward declarations of component classes referenced below (e.g. by
// AffineComponent's CollapseWithNext()/CollapseWithPrevious() functions).
class FixedAffineComponent;
class FixedScaleComponent;
class PerElementScaleComponent;
class PerElementOffsetComponent;
467 // Affine means a linear function plus an offset.
468 // Note: although this class can be instantiated, it also
469 // functions as a base-class for more specialized versions of
470 // AffineComponent.
471 class AffineComponent: public UpdatableComponent {
472 friend class SoftmaxComponent; // Friend declaration relates to mixing up.
473 public:
475 virtual int32 InputDim() const { return linear_params_.NumCols(); }
476 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
478 virtual std::string Info() const;
479 virtual void InitFromConfig(ConfigLine *cfl);
481 AffineComponent() { } // use Init to really initialize.
482 virtual std::string Type() const { return "AffineComponent"; }
483 virtual int32 Properties() const {
484 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
485 kBackpropNeedsInput|kBackpropAdds;
486 }
489 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
490 const CuMatrixBase<BaseFloat> &in,
491 CuMatrixBase<BaseFloat> *out) const;
492 virtual void Backprop(const std::string &debug_info,
493 const ComponentPrecomputedIndexes *indexes,
494 const CuMatrixBase<BaseFloat> &in_value,
495 const CuMatrixBase<BaseFloat> &, // out_value
496 const CuMatrixBase<BaseFloat> &out_deriv,
497 Component *to_update,
498 CuMatrixBase<BaseFloat> *in_deriv) const;
500 virtual void Read(std::istream &is, bool binary);
501 virtual void Write(std::ostream &os, bool binary) const;
503 virtual Component* Copy() const;
506 // Some functions from base-class UpdatableComponent.
507 virtual void Scale(BaseFloat scale);
508 virtual void Add(BaseFloat alpha, const Component &other);
509 virtual void PerturbParams(BaseFloat stddev);
510 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
511 virtual int32 NumParameters() const;
512 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
513 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
515 // Some functions that are specific to this class.
517 // This new function is used when mixing up:
518 virtual void SetParams(const VectorBase<BaseFloat> &bias,
519 const MatrixBase<BaseFloat> &linear);
520 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
521 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
522 explicit AffineComponent(const AffineComponent &other);
523 // The next constructor is used in converting from nnet1.
524 AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
525 const CuVectorBase<BaseFloat> &bias_params,
526 BaseFloat learning_rate);
527 void Init(int32 input_dim, int32 output_dim,
528 BaseFloat param_stddev, BaseFloat bias_stddev);
529 void Init(std::string matrix_filename);
531 // This function resizes the dimensions of the component, setting the
532 // parameters to zero, while leaving any other configuration values the same.
533 virtual void Resize(int32 input_dim, int32 output_dim);
535 // The following functions are used for collapsing multiple layers
536 // together. They return a pointer to a new Component equivalent to
537 // the sequence of two components. We haven't implemented this for
538 // FixedLinearComponent yet.
539 Component *CollapseWithNext(const AffineComponent &next) const ;
540 Component *CollapseWithNext(const FixedAffineComponent &next) const;
541 Component *CollapseWithNext(const FixedScaleComponent &next) const;
542 Component *CollapseWithPrevious(const FixedAffineComponent &prev) const;
544 protected:
545 friend class NaturalGradientAffineComponent;
546 // This function Update() is for extensibility; child classes may override
547 // this, e.g. for natural gradient update.
548 virtual void Update(
549 const std::string &debug_info,
550 const CuMatrixBase<BaseFloat> &in_value,
551 const CuMatrixBase<BaseFloat> &out_deriv) {
552 UpdateSimple(in_value, out_deriv);
553 }
554 // UpdateSimple is used when *this is a gradient. Child classes may override
555 // this if needed, but typically won't need to.
556 virtual void UpdateSimple(
557 const CuMatrixBase<BaseFloat> &in_value,
558 const CuMatrixBase<BaseFloat> &out_deriv);
560 const AffineComponent &operator = (const AffineComponent &other); // Disallow.
561 CuMatrix<BaseFloat> linear_params_;
562 CuVector<BaseFloat> bias_params_;
563 };
565 class RepeatedAffineComponent;
567 /// This class implements an affine transform using a block diagonal matrix
568 /// e.g., one whose weight matrix is all zeros except for blocks on the
569 /// diagonal. All these blocks have the same dimensions.
570 /// input-dim: num cols of block diagonal matrix.
571 /// output-dim: num rows of block diagonal matrix.
572 /// num-blocks: number of blocks in diagonal of the matrix.
573 /// num-blocks must divide both input-dim and output-dim
574 class BlockAffineComponent : public UpdatableComponent {
575 public:
576 virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
577 virtual int32 OutputDim() const { return linear_params_.NumRows(); }
579 virtual std::string Info() const;
580 virtual void InitFromConfig(ConfigLine *cfl);
582 BlockAffineComponent() { }
583 virtual std::string Type() const { return "BlockAffineComponent"; }
584 virtual int32 Properties() const {
585 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
586 kBackpropNeedsInput|kBackpropAdds;
587 }
589 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
590 const CuMatrixBase<BaseFloat> &in,
591 CuMatrixBase<BaseFloat> *out) const;
593 virtual void Backprop(const std::string &debug_info,
594 const ComponentPrecomputedIndexes *indexes,
595 const CuMatrixBase<BaseFloat> &in_value,
596 const CuMatrixBase<BaseFloat> &, // out_value
597 const CuMatrixBase<BaseFloat> &out_deriv,
598 Component *to_update,
599 CuMatrixBase<BaseFloat> *in_deriv) const;
601 virtual void Read(std::istream &is, bool binary);
602 virtual void Write(std::ostream &os, bool binary) const;
604 virtual Component* Copy() const;
606 // Functions from base-class UpdatableComponent.
607 virtual void Scale(BaseFloat scale);
608 virtual void Add(BaseFloat alpha, const Component &other);
609 virtual void PerturbParams(BaseFloat stddev);
610 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
611 virtual int32 NumParameters() const;
612 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
613 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
615 // BlockAffine-specific functions.
616 void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
617 BaseFloat param_stddev, BaseFloat bias_mean,
618 BaseFloat bias_stddev);
619 explicit BlockAffineComponent(const BlockAffineComponent &other);
620 explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
621 protected:
622 // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
623 // equal size. The blocks are stored in linear_params_ as
624 // [ M
625 // N
626 // O ] but we actually treat it as the matrix:
627 // [ M 0 0
628 // 0 N 0
629 // 0 0 O ]
630 CuMatrix<BaseFloat> linear_params_;
631 CuVector<BaseFloat> bias_params_;
632 int32 num_blocks_;
633 private:
634 const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
635 };
637 class RepeatedAffineComponent: public UpdatableComponent {
638 public:
640 virtual int32 InputDim() const { return linear_params_.NumCols() * num_repeats_; }
641 virtual int32 OutputDim() const { return linear_params_.NumRows() * num_repeats_; }
643 virtual std::string Info() const;
644 virtual void InitFromConfig(ConfigLine *cfl);
646 RepeatedAffineComponent() { } // use Init to really initialize.
647 virtual std::string Type() const { return "RepeatedAffineComponent"; }
648 virtual int32 Properties() const {
649 return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
650 kBackpropNeedsInput|kBackpropAdds|kInputContiguous|kOutputContiguous;
651 }
652 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
653 const CuMatrixBase<BaseFloat> &in,
654 CuMatrixBase<BaseFloat> *out) const;
655 virtual void Backprop(const std::string &debug_info,
656 const ComponentPrecomputedIndexes *indexes,
657 const CuMatrixBase<BaseFloat> &in_value,
658 const CuMatrixBase<BaseFloat> &, // out_value
659 const CuMatrixBase<BaseFloat> &out_deriv,
660 Component *to_update,
661 CuMatrixBase<BaseFloat> *in_deriv) const;
663 virtual void Read(std::istream &is, bool binary);
664 virtual void Write(std::ostream &os, bool binary) const;
666 virtual Component* Copy() const;
668 // Some functions from base-class UpdatableComponent.
669 virtual void Scale(BaseFloat scale);
670 virtual void Add(BaseFloat alpha, const Component &other);
671 virtual void PerturbParams(BaseFloat stddev);
672 virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
673 virtual int32 NumParameters() const;
674 virtual void Vectorize(VectorBase<BaseFloat> *params) const;
675 virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
677 // Some functions that are specific to this class.
678 const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
679 const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
680 explicit RepeatedAffineComponent(const RepeatedAffineComponent &other);
682 void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
683 BaseFloat param_stddev, BaseFloat bias_mean,
684 BaseFloat bias_stddev);
685 friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
686 protected:
687 // This function Update(), called from backprop, is broken out for
688 // extensibility to natural gradient update.
689 virtual void Update(
690 const CuMatrixBase<BaseFloat> &in_value,
691 const CuMatrixBase<BaseFloat> &out_deriv);
693 // This function does nothing here but is redefined in child-class
694 // NaturalGradientRepeatedAffineComponent. This help avoid repeated code.
695 virtual void SetNaturalGradientConfigs() { }
697 const RepeatedAffineComponent &operator = (const RepeatedAffineComponent &other); // Disallow.
698 CuMatrix<BaseFloat> linear_params_;
699 CuVector<BaseFloat> bias_params_;
700 int32 num_repeats_;
701 };
703 class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
704 public:
705 // Use Init() to really initialize.
706 NaturalGradientRepeatedAffineComponent() { }
708 // Most of the public functions are inherited from RepeatedAffineComponent.
709 virtual std::string Type() const {
710 return "NaturalGradientRepeatedAffineComponent";
711 }
713 virtual Component* Copy() const;
715 // Copy constructor
716 explicit NaturalGradientRepeatedAffineComponent(
717 const NaturalGradientRepeatedAffineComponent &other);
718 private:
719 virtual void Update(
720 const CuMatrixBase<BaseFloat> &in_value,
721 const CuMatrixBase<BaseFloat> &out_deriv);
723 const NaturalGradientRepeatedAffineComponent &operator=(
724 const NaturalGradientRepeatedAffineComponent &other); // Disallow.
726 // Applies the default configuration to preconditioner_in_.
727 virtual void SetNaturalGradientConfigs();
729 // For efficiency reasons we only apply the natural gradient to the input
730 // side, i.e. not to the space of output derivatives-- we believe the input
731 // side is the more important side. We don't make the natural-gradient
732 // configurable; we just give it a reasonable configuration.
733 // Instead of using the individual data-points, for efficiency reasons we use
734 // the distribution of per-minibatch summed derivatives over each dimension of
735 // the output space, as the source for the Fisher matrix.
736 OnlineNaturalGradient preconditioner_in_;
737 };
// SoftmaxComponent implements the softmax nonlinearity.  Input and output
// dimensions are equal (dimension handling is inherited from
// NonlinearComponent).
class SoftmaxComponent: public NonlinearComponent {
 public:
  // Copy constructor; copies the base-class state (dimension, stats, ...).
  explicit SoftmaxComponent(const SoftmaxComponent &other):
      NonlinearComponent(other) { }
  SoftmaxComponent() { }
  virtual std::string Type() const { return "SoftmaxComponent"; }
  virtual int32 Properties() const {
    // kBackpropNeedsOutput: Backprop() is given out_value (the softmax
    // output); kStoresStats: StoreStats() accumulates activation stats.
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  // Accumulates statistics of the output values (see kStoresStats above).
  virtual void StoreStats(const CuMatrixBase<BaseFloat> &out_value);

  virtual Component* Copy() const { return new SoftmaxComponent(*this); }
 private:
  SoftmaxComponent &operator = (const SoftmaxComponent &other); // Disallow.
};
/*
   Implements the log of a softmax nonlinearity, so it's the same
   as shifting each input vector by a constant offset so that, when
   exponentiated, it would sum to one.

   We usually use this in place of softmax because the log-scale
   output will not saturate.

   Configuration values accepted:
      dim            e.g. dim=8061.   Usually this is the last component
                     in a network, so 'dim' is the number of classes.
 */
class LogSoftmaxComponent: public NonlinearComponent {
 public:
  // Copy constructor; copies the base-class state (dimension, stats, ...).
  explicit LogSoftmaxComponent(const LogSoftmaxComponent &other):
      NonlinearComponent(other) { }
  LogSoftmaxComponent() { }
  virtual std::string Type() const { return "LogSoftmaxComponent"; }
  virtual int32 Properties() const {
    // kBackpropNeedsOutput: Backprop() is given out_value;
    // kStoresStats: activation statistics are accumulated.
    return kSimpleComponent|kBackpropNeedsOutput|kStoresStats;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const { return new LogSoftmaxComponent(*this); }
 private:
  LogSoftmaxComponent &operator = (const LogSoftmaxComponent &other); // Disallow.
};
803 /*
804 Keywords: natural gradient descent, NG-SGD, naturalgradient. For
805 the top-level of the natural gradient code look here, and also in
806 nnet-precondition-online.h.
807 NaturalGradientAffineComponent is
808 a version of AffineComponent that has a non-(multiple of unit) learning-rate
809 matrix. See nnet-precondition-online.h for a description of the technique.
810 It is described, under the name Online NG-SGD, in the paper "Parallel
811 training of DNNs with Natural Gradient and Parameter Averaging" (ICLR
812 workshop, 2015) by Daniel Povey, Xiaohui Zhang and Sanjeev Khudanpur.
814 Configuration values accepted by this component:
816 Values inherited from UpdatableComponent (see its declaration in
817 nnet-component-itf for details):
818 learning-rate
819 learning-rate-factor
820 max-change
822 Values used in initializing the component's parameters:
823 input-dim e.g. input-dim=1024. The input dimension.
824 output-dim e.g. output-dim=1024. The output dimension.
825 param-stddev e.g. param-stddev=0.025. The standard deviation
826 used to randomly initialize the linear parameters
827 (as Gaussian random values * param-stddev).
828 Defaults to 1/sqrt(input-dim), which is Glorot
829 initialization.
830 bias-stddev e.g. bias-stddev=0.0. The standard deviation
831 used to randomly initialize the bias parameters.
832 Defaults to 1.0 but we usually set it to 0.0
833 in the config.
      bias-mean       e.g. bias-mean=1.0.  Allows you to initialize the
                      bias parameters with an offset.  Default is 0.0
                      which is normally suitable.
838 matrix e.g. matrix=foo/bar/init.mat May be used as an
839 alternative to (input-dim, output-dim, param-stddev,
840 bias-stddev, bias-mean) to initialize the parameters.
841 Dimension is output-dim by (input-dim + 1), last
842 column is interpreted as the bias.
844 Options to the natural gradient (you won't normally have to set these,
845 the defaults are suitable):
847 num-samples-history Number of frames used as the time-constant to
848 determine how 'up-to-date' the Fisher-matrix
849 estimates are. Smaller -> more up-to-date, but more
850 noisy. default=2000.
851 alpha Constant that determines how much we smooth the
852 Fisher-matrix estimates with the unit matrix.
853 Larger means more smoothing. default=4.0
854 rank-in Rank used in low-rank-plus-unit estimate of Fisher
855 matrix in the input space. default=20.
856 rank-out Rank used in low-rank-plus-unit estimate of Fisher
857 matrix in the output-derivative space. default=80.
      update-period   Determines the frequency (in minibatches) with which
                      we update the Fisher-matrix estimates;
                      making this > 1 saves a little time in training.
                      default=4.
862 */
863 class NaturalGradientAffineComponent: public AffineComponent {
864 public:
865 virtual std::string Type() const { return "NaturalGradientAffineComponent"; }
866 virtual void Read(std::istream &is, bool binary);
867 virtual void Write(std::ostream &os, bool binary) const;
868 void Init(int32 input_dim, int32 output_dim,
869 BaseFloat param_stddev, BaseFloat bias_stddev, BaseFloat bias_mean,
870 int32 rank_in, int32 rank_out, int32 update_period,
871 BaseFloat num_samples_history, BaseFloat alpha);
872 void Init(int32 rank_in, int32 rank_out, int32 update_period,
873 BaseFloat num_samples_history,
874 BaseFloat alpha, std::string matrix_filename);
875 // this constructor does not really initialize, use Init() or Read().
876 NaturalGradientAffineComponent();
877 void Resize(int32 input_dim, int32 output_dim);
878 void InitFromConfig(ConfigLine *cfl);
879 virtual std::string Info() const;
880 virtual Component* Copy() const;
881 virtual void Scale(BaseFloat scale);
882 virtual void Add(BaseFloat alpha, const Component &other);
883 // copy constructor
884 explicit NaturalGradientAffineComponent(
885 const NaturalGradientAffineComponent &other);
886 private:
887 // disallow assignment operator.
888 NaturalGradientAffineComponent &operator= (
889 const NaturalGradientAffineComponent&);
891 // Configs for preconditioner. The input side tends to be better conditioned ->
892 // smaller rank needed, so make them separately configurable.
893 int32 rank_in_;
894 int32 rank_out_;
895 int32 update_period_;
896 BaseFloat num_samples_history_;
897 BaseFloat alpha_;
899 OnlineNaturalGradient preconditioner_in_;
901 OnlineNaturalGradient preconditioner_out_;
903 // Sets the configs rank, alpha and eta in the preconditioner objects,
904 // from the class variables.
905 void SetNaturalGradientConfigs();
907 virtual void Update(
908 const std::string &debug_info,
909 const CuMatrixBase<BaseFloat> &in_value,
910 const CuMatrixBase<BaseFloat> &out_deriv);
911 };
/// FixedAffineComponent is an affine transform that is supplied
/// at network initialization time and is not trainable.
class FixedAffineComponent: public Component {
 public:
  FixedAffineComponent() { }
  virtual std::string Type() const { return "FixedAffineComponent"; }
  virtual std::string Info() const;

  // Copy constructor from AffineComponent-- can be used when we're done
  // training a particular part of the model and want to efficiently disable
  // further training.
  // NOTE(review): not 'explicit', so AffineComponent converts implicitly to
  // FixedAffineComponent -- confirm this is intended.
  FixedAffineComponent(const AffineComponent &c);

  /// matrix should be of size input-dim+1 to output-dim, last col is offset
  void Init(const CuMatrixBase<BaseFloat> &matrix);

  // The ConfigLine cfl contains just the option matrix=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 Properties() const { return kSimpleComponent|kBackpropAdds; }
  virtual int32 InputDim() const { return linear_params_.NumCols(); }
  virtual int32 OutputDim() const { return linear_params_.NumRows(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Function to provide access to linear_params_.
  const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
 protected:
  friend class AffineComponent;
  // linear_params_ is output-dim by input-dim; bias_params_ has
  // dimension output-dim (see InputDim()/OutputDim() above).
  CuMatrix<BaseFloat> linear_params_;
  CuVector<BaseFloat> bias_params_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedAffineComponent);
};
/// SumGroupComponent is used to sum up groups of posteriors.
/// It's used to introduce a kind of Gaussian-mixture-model-like
/// idea into neural nets.  This is basically a degenerate case of
/// MixtureProbComponent; we had to implement it separately to
/// be efficient for CUDA (we can use this one regardless of whether
/// we have CUDA or not; it's the normal case we want anyway).
///
/// There are two forms of initialization in a config file: one
/// where the number of elements are specified for each group
/// individually as a vector, and one where only the total input
/// dimension and the output dimension (number of groups) is specified.
/// The second is used when all groups have the same size.
class SumGroupComponent: public Component {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  void Init(const std::vector<int32> &sizes); // the vector is of the input dim
                                              // (>= 1) for each output dim.
  // Initialization used when all groups have the same size,
  // i.e. input_dim / output_dim elements per group.
  void Init(int32 input_dim, int32 output_dim);
  void GetSizes(std::vector<int32> *sizes) const; // Get a vector saying, for
                                                  // each output-dim, how many
                                                  // inputs were summed over.
  virtual void InitFromConfig(ConfigLine *cfl);
  SumGroupComponent() { }
  virtual std::string Type() const { return "SumGroupComponent"; }
  virtual int32 Properties() const { return kSimpleComponent|kLinearInInput; }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(SumGroupComponent);
  // Note: Int32Pair is just struct{ int32 first; int32 second }; it's defined
  // in cu-matrixdim.h as extern "C" which is needed for the CUDA interface.
  CuArray<Int32Pair> indexes_; // for each output index, the (start, end) input
                               // index.
  CuArray<int32> reverse_indexes_; // for each input index, the output index.
  int32 input_dim_;
  int32 output_dim_;
};
/// FixedScaleComponent applies a fixed per-element scale; it's similar
/// to the Rescale component in the nnet1 setup (and only needed for nnet1
/// model conversion).
class FixedScaleComponent: public Component {
 public:
  FixedScaleComponent() { }
  virtual std::string Type() const { return "FixedScaleComponent"; }
  virtual std::string Info() const;
  virtual int32 Properties() const {
    // In-place propagate/backprop are possible because the operation is a
    // per-element multiply (input and output dims are equal).
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace;
  }

  // Initializes from the supplied vector of per-element scales.
  void Init(const CuVectorBase<BaseFloat> &scales);

  // The ConfigLine cfl contains only the option scales=<string>,
  // where the string is the filename of a Kaldi-format matrix to read.
  virtual void InitFromConfig(ConfigLine *cfl);

  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  virtual Component* Copy() const;
  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

 protected:
  friend class AffineComponent;  // necessary for collapse
  // The fixed (non-trainable) per-element scales.
  CuVector<BaseFloat> scales_;
  KALDI_DISALLOW_COPY_AND_ASSIGN(FixedScaleComponent);
};
1058 /// FixedBiasComponent applies a fixed per-element bias; it's similar
1059 /// to the AddShift component in the nnet1 setup (and only needed for nnet1
1060 /// model conversion.
1061 class FixedBiasComponent: public Component {
1062 public:
1063 FixedBiasComponent() { }
1064 virtual std::string Type() const { return "FixedBiasComponent"; }
1065 virtual std::string Info() const;
1067 virtual int32 Properties() const {
1068 return kSimpleComponent|kPropagateInPlace|kBackpropInPlace;
1069 }
1071 void Init(const CuVectorBase<BaseFloat> &scales);
1073 // The ConfigLine cfl contains only the option bias=<string>,
1074 // where the string is the filename of a Kaldi-format matrix to read.
1075 virtual void InitFromConfig(ConfigLine *cfl);
1076 virtual int32 InputDim() const { return bias_.Dim(); }
1077 virtual int32 OutputDim() const { return bias_.Dim(); }
1078 using Component::Propagate; // to avoid name hiding
1079 virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
1080 const CuMatrixBase<BaseFloat> &in,
1081 CuMatrixBase<BaseFloat> *out) const;
1082 virtual void Backprop(const std::string &debug_info,
1083 const ComponentPrecomputedIndexes *indexes,
1084 const CuMatrixBase<BaseFloat> &, // in_value,
1085 const CuMatrixBase<BaseFloat> &, // out_value
1086 const CuMatrixBase<BaseFloat> &out_deriv,
1087 Component *, // to_update
1088 CuMatrixBase<BaseFloat> *in_deriv) const;
1089 virtual Component* Copy() const;
1090 virtual void Read(std::istream &is, bool binary);
1091 virtual void Write(std::ostream &os, bool binary) const;
1093 protected:
1094 CuVector<BaseFloat> bias_;
1095 KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent);
1096 };
// NoOpComponent just duplicates its input.  We don't anticipate this being used
// very often, but it may sometimes make your life easier.
class NoOpComponent: public NonlinearComponent {
 public:
  // Copy constructor; copies the base-class state (dimension, stats, ...).
  explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { }
  NoOpComponent() { }
  virtual std::string Type() const { return "NoOpComponent"; }
  virtual int32 Properties() const {
    // The identity map is linear in its input and can be done in place.
    return kSimpleComponent|kLinearInInput|kPropagateInPlace;
  }
  virtual Component* Copy() const { return new NoOpComponent(*this); }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
 private:
  NoOpComponent &operator = (const NoOpComponent &other); // Disallow.
};
// ClipGradientComponent just duplicates its input, but clips gradients
// during backpropagation if they cross a predetermined threshold.
// This component will be used to prevent gradient explosion problem in
// recurrent neural networks.
class ClipGradientComponent: public Component {
 public:
  // Constructor taking the full state, including the accumulated stats
  // (num_clipped, count, num_self_repaired, num_backpropped); used by Copy().
  ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,
                        bool norm_based_clipping,
                        BaseFloat self_repair_clipped_proportion_threshold,
                        BaseFloat self_repair_target,
                        BaseFloat self_repair_scale,
                        int32 num_clipped,
                        int32 count,
                        int32 num_self_repaired,
                        int32 num_backpropped) {
    Init(dim, clipping_threshold, norm_based_clipping,
         self_repair_clipped_proportion_threshold,
         self_repair_target,
         self_repair_scale,
         num_clipped, count,
         num_self_repaired, num_backpropped);}

  // Default constructor: clipping_threshold_ = -1 here (an otherwise
  // invalid value); use InitFromConfig(), Init() or Read() to initialize.
  ClipGradientComponent(): dim_(0), clipping_threshold_(-1),
                           norm_based_clipping_(false),
                           self_repair_clipped_proportion_threshold_(1.0),
                           self_repair_target_(0.0),
                           self_repair_scale_(0.0),
                           num_clipped_(0), count_(0),
                           num_self_repaired_(0), num_backpropped_(0) { }

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping,
            BaseFloat self_repair_clipped_proportion_threshold,
            BaseFloat self_repair_target,
            BaseFloat self_repair_scale,
            int32 num_clipped, int32 count,
            int32 num_self_repaired, int32 num_backpropped);

  virtual std::string Type() const { return "ClipGradientComponent"; }

  virtual int32 Properties() const {
    // kBackpropNeedsInput: the input values are needed by the self-repair
    // mechanism in Backprop (see RepairGradients below).
    return kSimpleComponent|kLinearInInput|kPropagateInPlace|kBackpropInPlace|
           kBackpropNeedsInput;
  }

  virtual void ZeroStats();

  virtual Component* Copy() const {
    return new ClipGradientComponent(dim_,
                                     clipping_threshold_,
                                     norm_based_clipping_,
                                     self_repair_clipped_proportion_threshold_,
                                     self_repair_target_,
                                     self_repair_scale_,
                                     num_clipped_,
                                     count_,
                                     num_self_repaired_,
                                     num_backpropped_);}

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
  // The destructor logs a summary of self-repair activity for this
  // training job, if self-repair was ever activated.
  virtual ~ClipGradientComponent() {
    if (num_self_repaired_ > 0)
      KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info_
                << ")'s self-repair was activated " << num_self_repaired_
                << " time(s) out of " << num_backpropped_
                << " times of calling Backprop() in this training job.";
  }
 private:
  int32 dim_;  // input/output dimension
  BaseFloat clipping_threshold_;  // threshold to be used for clipping
                                  // could correspond to max-row-norm (if
                                  // norm_based_clipping_ == true) or
                                  // max-absolute-value (otherwise)
  bool norm_based_clipping_;  // if true the max-row-norm will be clipped
                              // else element-wise absolute value clipping is
                              // done

  // some configuration values relating to self-repairing.
  BaseFloat self_repair_clipped_proportion_threshold_; // the threshold of
                                                       // clipped-proportion
                                                       // for self-repair to be
                                                       // activated
  BaseFloat self_repair_target_; // the target value towards which self-repair
                                 // is trying to set for in-deriv
  BaseFloat self_repair_scale_;  // constant scaling the self-repair vector
  std::string debug_info_;   // component-node name, used in the destructor to
                             // print out stats of self-repair

  // this function is called from Backprop code, and only does something if the
  // self-repair-scale config value is set and the current clipped proportion
  // exceeds the threshold.  What it does is to add a term to in-deriv that
  // forces the input to the ClipGradientComponent to be close to some small
  // value (e.g., 0.0 or 0.5, depending on what the input is, e.g.,
  // Sigmoid or Tanh or Affine).  The hope is that if the input is forced to be
  // small, the parameters on the path will also tend to be small, which may
  // help tamp down the divergence caused by gradient explosion.
  void RepairGradients(const std::string &debug_info,
                       const CuMatrixBase<BaseFloat> &in_value,
                       CuMatrixBase<BaseFloat> *in_deriv,
                       ClipGradientComponent *to_update) const;

  ClipGradientComponent &operator =
      (const ClipGradientComponent &other); // Disallow.

 protected:
  // variables to store stats
  // An element corresponds to rows of derivative matrix, when
  // norm_based_clipping_ is true,
  // else it corresponds to each element of the derivative matrix
  // Note: no stats are stored when norm_based_clipping_ is false
  int32 num_clipped_;  // number of elements which were clipped
  int32 count_;  // number of elements which were processed
  int32 num_self_repaired_; // number of times self-repair is activated
  int32 num_backpropped_; // number of times backprop is called

};
/** PermuteComponent changes the order of the columns (i.e. the feature or
    activation dimensions).  Output dimension i is mapped to input dimension
    column_map_[i], so it's like doing:
      for each row:
        for each feature/activation dimension i:
          output(row, i) = input(row, column_map_[i]).

*/
class PermuteComponent: public Component {
 public:
  PermuteComponent()  {}
  // NOTE(review): not 'explicit', so std::vector<int32> converts implicitly
  // to PermuteComponent -- confirm this is intended.
  PermuteComponent(const std::vector<int32> &column_map) { Init(column_map); }

  virtual int32 InputDim() const { return column_map_.Dim(); }
  virtual int32 OutputDim() const { return column_map_.Dim(); }
  virtual void InitFromConfig(ConfigLine *cfl);
  void Init(const std::vector<int32> &column_map);

  virtual std::string Type() const { return "PermuteComponent"; }

  virtual int32 Properties() const {
    // A permutation is linear in its input; note that it cannot be done
    // in place (the column map is not necessarily the identity).
    return kSimpleComponent|kLinearInInput;
  }

  virtual void ZeroStats() {}

  virtual Component* Copy() const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, //in_value
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // A permutation has no parameters, so Scale() and Add() are no-ops.
  virtual void Scale(BaseFloat scale) {}
  virtual void Add(BaseFloat alpha, const Component &other) {}
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
 private:
  // computes the reverse column map.  Must not be called if column_map_.Dim()
  // == 0
  void ComputeReverseColumnMap();
  CuArray<int32> column_map_;
  // the following is a derived variable, not written to disk.
  // It is used in backprop.
  CuArray<int32> reverse_column_map_;
  PermuteComponent &operator =
      (const PermuteComponent &other); // Disallow.
};
// PerElementScaleComponent scales each dimension of its input with a separate
// trainable scale; it's like a linear component with a diagonal matrix.
class PerElementScaleComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return scales_.Dim(); }
  virtual int32 OutputDim() const { return scales_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementScaleComponent() { }  // use Init to really initialize.
  virtual std::string Type() const { return "PerElementScaleComponent"; }
  virtual int32 Properties() const {
    // kLinearInParameters: the output is linear in scales_;
    // kBackpropNeedsInput: the parameter derivative involves in_value.
    return kSimpleComponent|kUpdatableComponent|kLinearInInput|
        kLinearInParameters|kBackpropNeedsInput|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementScaleComponent(const PerElementScaleComponent &other);

  // Initializes the scales randomly (Gaussian with the given mean/stddev).
  void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev);
  // Initializes the scales from a vector read from 'vector_filename'.
  void Init(std::string vector_filename);

 protected:
  friend class AffineComponent;  // necessary for collapse
  // This function Update() is for extensibility; child classes may override
  // this, e.g. for natural gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv) {
    UpdateSimple(in_value, out_deriv);
  }
  // UpdateSimple is used when *this is a gradient.  Child classes may override
  // this if needed, but typically won't need to.
  virtual void UpdateSimple(
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const PerElementScaleComponent &operator
      = (const PerElementScaleComponent &other); // Disallow.
  CuVector<BaseFloat> scales_;
};
// PerElementOffsetComponent offsets each dimension of its input with a
// separate trainable bias; it's like an affine component with a fixed
// weight matrix which is always equal to I.
class PerElementOffsetComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return offsets_.Dim(); }
  virtual int32 OutputDim() const { return offsets_.Dim(); }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);

  PerElementOffsetComponent() { } // use Init to really initialize.
  virtual std::string Type() const { return "PerElementOffsetComponent"; }
  virtual int32 Properties() const {
    // Adding an offset can be done in place, in both directions.
    return kSimpleComponent|kUpdatableComponent|
           kBackpropInPlace|kPropagateInPlace;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  explicit PerElementOffsetComponent(const PerElementOffsetComponent &other);

  // Initializes the offsets randomly (Gaussian with the given mean/stddev).
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev);
  // Initializes the offsets from a vector read from 'vector_filename'.
  void Init(std::string vector_filename);

 protected:
  const PerElementOffsetComponent &operator
      = (const PerElementOffsetComponent &other); // Disallow.
  CuVector<BaseFloat> offsets_;
};
// ConstantFunctionComponent returns a constant function of its input,
// i.e. its output does not depend on its input.  It is the same as
// an affine component with the linear term fixed at zero.
// It is optionally trainable, and optionally you can use natural
// gradient.  The input is required only because it's more convenient
// to make SimpleComponents [but see ConstantComponent, which requires
// no inputs].
class ConstantFunctionComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_.Dim(); }

  virtual std::string Info() const;
  // possible parameter values with their defaults:
  // input-dim=-1 is-updatable=true use-natural-gradient=true output-dim=-1
  // output-mean=0 output-stddev=0
  virtual void InitFromConfig(ConfigLine *cfl);

  ConstantFunctionComponent();

  ConstantFunctionComponent(const ConstantFunctionComponent &other);

  virtual std::string Type() const { return "ConstantFunctionComponent"; }
  virtual int32 Properties() const {
    // Updatable-related flags are only advertised when is_updatable_ is
    // true; in-place propagation is possible only when dims match.
    return kSimpleComponent|
        (is_updatable_ ? kUpdatableComponent|kLinearInParameters : 0) |
        (InputDim() == OutputDim() ? kPropagateInPlace: 0) |
        kBackpropAdds;
  }
  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &, // in_value
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
 private:
  int32 input_dim_;
  // the output value-- a vector.
  CuVector<BaseFloat> output_;

  bool is_updatable_;
  // if true, and if updatable, do natural-gradient update.
  bool use_natural_gradient_;
  OnlineNaturalGradient preconditioner_;

  const ConstantFunctionComponent &operator
      = (const ConstantFunctionComponent &other); // Disallow.
};
// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
// it uses a natural gradient update for the per-element scales, and enforces a
// maximum amount of change per minibatch, for stability.
class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
 public:

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  NaturalGradientPerElementScaleComponent() { } // use Init to really initialize.
  virtual std::string Type() const {
    return "NaturalGradientPerElementScaleComponent";
  }

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions that are specific to this class:
  explicit NaturalGradientPerElementScaleComponent(
      const NaturalGradientPerElementScaleComponent &other);

  // Initializes the scales randomly, plus the natural-gradient
  // configuration (rank, update period, history, alpha).
  void Init(int32 dim, BaseFloat param_mean,
            BaseFloat param_stddev, int32 rank, int32 update_period,
            BaseFloat num_samples_history, BaseFloat alpha);
  // Initializes the scales from a vector on disk, plus the
  // natural-gradient configuration.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha);

 private:
  // unlike the NaturalGradientAffineComponent, there is only one dimension to
  // consider as the parameters are a vector not a matrix, so we only need one
  // preconditioner.
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  // Override of the parent-class Update() function, called only
  // if this->is_gradient_ = false; this implements the natural
  // gradient update.
  virtual void Update(
      const std::string &debug_info,
      const CuMatrixBase<BaseFloat> &in_value,
      const CuMatrixBase<BaseFloat> &out_deriv);

  const NaturalGradientPerElementScaleComponent &operator
      = (const NaturalGradientPerElementScaleComponent &other); // Disallow.
};
1568 /**
1569 * ConvolutionalComponent implements 2d-convolution.
1570 * It uses 3D filters on 3D inputs, but the 3D filters hop only over
1571 * 2 dimensions as it has same size as the input along the 3rd dimension.
1572 * Input : A matrix where each row is a vectorized 3D-tensor.
1573 * The 3D tensor has dimensions
1574 * x: (e.g. time)
1575 * y: (e.g. frequency)
1576 * z: (e.g. channels like features/delta/delta-delta)
1577 *
1578 * The component supports input vectorizations of type zyx and yzx.
1579 * The default vectorization type is zyx.
1580 * e.g. for input vectorization of type zyx the input is vectorized by
1581 * spanning axes z, y and x of the tensor in that order.
1582 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1583 * the zyx vectorized input looks like
1584 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1585 *
1586 *
1587 * Output : The output is also a 3D tensor vectorized in the zyx format.
1588 * The channel axis (z) in the output corresponds to the output of
1589 * different filters. The first channel corresponds to the first filter
1590 * i.e., first row of the filter_params_ matrix.
1591 *
1592 * Note: The component has to support yzx input vectorization as the binaries
1593 * like add-deltas generate yz vectorized output. These input vectors are
1594 * concatenated using the Append descriptor across time steps to form a yzx
1595 * vectorized 3D tensor input.
1596 * e.g. Append(Offset(input, -1), input, Offset(input, 1))
1597 *
1598 *
1599 * For information on the hyperparameters and parameters of this component see
1600 * the variable declarations.
1601 *
1602 * Propagation:
1603 * ------------
1604 * Convolution operation consists of a dot-products between the filter tensor
1605 * and input tensor patch, for various shifts of filter tensor along the x and y
1606 * axes input tensor. (Note: there is no shift along z-axis as the filter and
1607 * input tensor have same size along this axis).
1608 *
1609 * For a particular shift (i,j) of the filter tensor
1610 * along input tensor dimensions x and y, the elements of the input tensor which
1611 * overlap with the filter form the input tensor patch. This patch is vectorized
1612 * in zyx format. All the patches corresponding to various samples in the
1613 * mini-batch are stacked into a matrix, where each row corresponds to one
1614 * patch. Let this matrix be represented by X_{i,j}. The dot products with
1615 * various filters are computed simultaneously by computing the matrix product
1616 * with the filter_params_ matrix (W)
1617 * Y_{i,j} = X_{i,j}*W^T.
1618 * Each row of W corresponds to one filter 3D tensor vectorized in zyx format.
1619 *
1620 * All the matrix products corresponding to various shifts (i,j) of the
1621 * filter tensor are computed simultaneously using the AddMatMatBatched
1622 * call of CuMatrixBase class.
1623 *
1624 * BackPropagation:
1625 * ----------------
1626 * Backpropagation to compute the input derivative (\nabla X_{i,j})
 * consists of a series of matrix products.
1628 * \nablaX_{i,j} = \nablaY_{i,j}*W where \nablaY_{i,j} corresponds to the
1629 * output derivative for a particular shift of the filter.
1630 *
1631 * Once again these matrix products are computed simultaneously.
1632 *
1633 * Update:
1634 * -------
1635 * The weight gradient is computed as
1636 * \nablaW = \Sum_{i,j} (X_{i,j}^T *\nablaY_{i,j})
1637 *
1638 */
class ConvolutionComponent: public UpdatableComponent {
 public:
  // Specifies the order in which the axes of the 3D input tensor are
  // traversed when it is vectorized into a matrix row (see the comment
  // above this class for details).
  enum TensorVectorizationType  {
    kYzx = 0,
    kZyx = 1
  };

  ConvolutionComponent();
  // constructor using another component
  ConvolutionComponent(const ConvolutionComponent &component);
  // constructor using parameters
  ConvolutionComponent(
    const CuMatrixBase<BaseFloat> &filter_params,
    const CuVectorBase<BaseFloat> &bias_params,
    int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
    int32 filt_x_dim, int32 filt_y_dim,
    int32 filt_x_step, int32 filt_y_step,
    TensorVectorizationType input_vectorization,
    BaseFloat learning_rate);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "ConvolutionComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
           kBackpropAdds|kPropagateAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update_in,
                        CuMatrixBase<BaseFloat> *in_deriv) const;
  void Update(const std::string &debug_info,
              const CuMatrixBase<BaseFloat> &in_value,
              const CuMatrixBase<BaseFloat> &out_deriv,
              const std::vector<CuSubMatrix<BaseFloat> *>& out_deriv_batch);

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // Some functions that are specific to this class.
  void SetParams(const VectorBase<BaseFloat> &bias,
                 const MatrixBase<BaseFloat> &filter);
  const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
  const CuMatrix<BaseFloat> &LinearParams() const { return filter_params_; }
  void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
            int32 filt_x_dim, int32 filt_y_dim,
            int32 filt_x_step, int32 filt_y_step, int32 num_filters,
            TensorVectorizationType input_vectorization,
            BaseFloat param_stddev, BaseFloat bias_stddev);
  // there is no filt_z_dim parameter as the length of the filter along
  // z-dimension is same as the input
  void Init(int32 input_x_dim, int32 input_y_dim, int32 input_z_dim,
            int32 filt_x_dim, int32 filt_y_dim,
            int32 filt_x_step, int32 filt_y_step,
            TensorVectorizationType input_vectorization,
            std::string matrix_filename);

  // resize the component, setting the parameters to zero, while
  // leaving any other configuration values the same
  void Resize(int32 input_dim, int32 output_dim);

  void Update(const std::string &debug_info,
              const CuMatrixBase<BaseFloat> &in_value,
              const CuMatrixBase<BaseFloat> &out_deriv);

 private:
  int32 input_x_dim_;   // size of the input along x-axis
                        // (e.g. number of time steps)

  int32 input_y_dim_;   // size of input along y-axis
                        // (e.g. number of mel-frequency bins)

  int32 input_z_dim_;   // size of input along z-axis
                        // (e.g. number of channels is 3 if the input has
                        // features + delta + delta-delta features

  int32 filt_x_dim_;    // size of the filter along x-axis

  int32 filt_y_dim_;    // size of the filter along y-axis

  // there is no filt_z_dim_ as it is always assumed to be
  // the same as input_z_dim_

  int32 filt_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next dot-product
                        // of filter and input

  int32 filt_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next dot-product of the filter
                        // and input

  // there is no filt_z_step_ as only dot product is possible along this axis

  TensorVectorizationType input_vectorization_; // type of vectorization of the
  // input 3D tensor. Accepts zyx and yzx formats

  CuMatrix<BaseFloat> filter_params_;
  // the filter (or kernel) matrix is a matrix of vectorized 3D filters
  // where each row in the matrix corresponds to one filter.
  // The 3D filter tensor is vectorized in zyx format.
  // The first row of the matrix corresponds to the first filter and so on.
  // Keep in mind the vectorization type and order of filters when using file
  // based initialization.

  CuVector<BaseFloat> bias_params_;
  // the filter-specific bias vector (i.e., there is a separate bias added
  // to the output of each filter).

  // NOTE(review): presumably true when this component stores a gradient
  // rather than actual parameters, as in other updatable components --
  // confirm against the .cc file.
  bool is_gradient_;

  // Helper used in Propagate()/Update(): extracts from 'in' the vectorized
  // input patch for each filter shift, one patch per row of 'patches'
  // (see "Propagation" in the comment above this class).
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Helper used in Backprop(): accumulates the per-patch input derivatives
  // back into the full input-derivative matrix (reverse of the mapping done
  // by InputToInputPatches).
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;
  const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
};
1779 /*
1780 LstmNonlinearityComponent is a component that implements part of an LSTM, by
1781 combining together the sigmoids and tanh's, plus some diagonal terms, into
1782 a single block.
1783 We will refer to the LSTM formulation used in
  "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling"
1786 by H. Sak et al,
1787 http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf.
1789 Suppose the cell dimension is C. Then outside this component, we compute
1790 the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single
1791 matrix multiplication:
1793 i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i
1794 f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f
1795 c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c
      o_part = W_{ox} x_t + W_{om} m_{t-1} + b_o
1798 The part of the computation that takes place in this component is as follows.
1799 Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and
1800 c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t.
1802 To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t).
1805 This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f
1806 and w_o.
1809 In the forward pass (Propagate), this component computes the following:
1811 i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1)
1812 f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2)
1813 c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3)
1814 o_t = Sigmoid(o_part + w_{oc}*c_t) (4)
1815 m_t = o_t * Tanh(c_t) (5)
1816 # note: the outputs are just c_t and m_t.
1818 The backprop is as you would think, but for the "self-repair" we need to pass
1819 in additional vectors (of the same dim as the parameters of the layer) that
1820 dictate whether or not we add an additional term to the backpropagated
1821 derivatives. (This term helps force the input to the nonlinearities into the
1822 range where the derivatives are not too small).
1824 This component stores stats of the same form as are normally stored by the
1825 StoreStats() functions for the sigmoid and tanh units, i.e. averages of the
1826 activations and derivatives, but this is done inside the Backprop() functions.
1827 [the StoreStats() functions don't take the input data as an argument, so
1828 storing this data that way is impossible, and anyway it's more efficient to
1829 do it as part of backprop.]
1831 Configuration values accepted:
1832 cell-dim e.g. cell-dim=1024 Cell dimension. The input
1833 dimension of this component is cell-dim * 5, and the
1834 output dimension is cell-dim * 2. Note: this
1835 component implements only part of the LSTM layer,
1836 see comments above.
1837 param-stddev Standard deviation for random initialization of
1838 the diagonal matrices (AKA peephole connections).
1839 default=1.0, which is probably too high but
1840 we couldn't see any reliable gain from decreasing it.
1841 tanh-self-repair-threshold Equivalent to the self-repair-lower-threshold
1842 in a TanhComponent; applies to both the tanh nonlinearities.
                      default=0.2, you probably won't want to change this.
1844 sigmoid-self-repair-threshold Equivalent to self-repair-lower-threshold
1845 in a SigmoidComponent; applies to all three of the sigmoid
1846 nonlinearities. default=0.05, you probably won't want to
1847 change this.
1848 self-repair-scale Equivalent to the self-repair-scale in a SigmoidComponent
1849 or TanhComponent; applies to both the sigmoid and tanh
1850 nonlinearities. default=1.0e-05, which you probably won't
1851 want to change unless dealing with an objective function
1852 that has smaller or larger dynamic range than normal, in
1853 which case you might want to make it smaller or larger.
1854 */
class LstmNonlinearityComponent: public UpdatableComponent {
 public:

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  LstmNonlinearityComponent() { } // use Init to really initialize.
  virtual std::string Type() const { return "LstmNonlinearityComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update_in,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  virtual Component* Copy() const;

  // Some functions from base-class UpdatableComponent.
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
  virtual void ZeroStats();

  // Some functions that are specific to this class:
  explicit LstmNonlinearityComponent(
      const LstmNonlinearityComponent &other);

  // Initializes the component for a cell dimension of 'cell_dim'; the other
  // arguments correspond to the identically-named configuration values
  // documented in the comment above this class.
  void Init(int32 cell_dim, BaseFloat param_stddev,
            BaseFloat tanh_self_repair_threshold,
            BaseFloat sigmoid_self_repair_threshold,
            BaseFloat self_repair_scale);

  // NOTE(review): this overload's arguments mirror the natural-gradient
  // configuration of other components (rank/update-period/etc.); confirm
  // its exact behavior against the .cc file.
  void Init(std::string vector_filename,
            int32 rank, int32 update_period, BaseFloat num_samples_history,
            BaseFloat alpha);

 private:

  // Initializes the natural-gradient object with the configuration we
  // use for this object, which for now is hardcoded at the C++ level.
  void InitNaturalGradient();


  // Notation: C is the cell dimension; it equals params_.NumCols().

  // The dimension of the parameter matrix is (3 x C);
  // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
  CuMatrix<BaseFloat> params_;

  // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
  // equations (1) through (5), this is the sum of the values of the
  // nonlinearities (used for diagnostics only). It is comparable to the
  // value_sum_ vector in base-class NonlinearComponent.
  CuMatrix<double> value_sum_;

  // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in
  // equations (1) through (5), this is the sum of the derivatives of the
  // nonlinearities (used for diagnostics and to control self-repair). It is
  // comparable to the deriv_sum_ vector in base-class
  // NonlinearComponent.
  CuMatrix<double> deriv_sum_;

  // This vector has dimension 10. The contents are a block of 5 self-repair
  // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5
  // self-repair scales (typically all 0.00001). These are for each of the 5
  // nonlinearities in the LSTM component in turn (see comments in cu-math.h for
  // more info).
  CuVector<BaseFloat> self_repair_config_;

  // This vector has dimension 5. For each of the 5 nonlinearities in the LSTM
  // component (see comments in cu-math.h for more info), it contains the total,
  // over all frames represented in count_, of the number of dimensions that
  // were subject to self_repair. To get the self-repair proportion you should
  // divide by (count_ times cell_dim_).
  CuVector<double> self_repair_total_;

  // The total count (number of frames) corresponding to the stats in value_sum_
  // and deriv_sum_.
  double count_;

  // Preconditioner for the parameters of this component [operates in the space
  // of dimension C].
  // The preconditioner stores its own configuration values; we write and read
  // these, but not the preconditioner object itself.
  OnlineNaturalGradient preconditioner_;

  const LstmNonlinearityComponent &operator
      = (const LstmNonlinearityComponent &other); // Disallow.
};
1964 /*
1965 * MaxPoolingComponent :
1966 * Maxpooling component was firstly used in ConvNet for selecting an
1967 * representative activation in an area. It inspired Maxout nonlinearity.
1968 * Each output element of this component is the maximum of a block of
1969 * input elements where the block has a 3D dimension (pool_x_size_,
1970 * pool_y_size_, pool_z_size_).
1971 * Blocks could overlap if the shift value on any axis is smaller
1972 * than its corresponding pool size (e.g. pool_x_step_ < pool_x_size_).
 * If the shift values are equal to their pool size, there is no
1974 * overlap; while if they all equal 1, the blocks overlap to
1975 * the greatest possible extent.
1976 *
1977 * This component is designed to be used after a ConvolutionComponent
1978 * so that the input matrix is propagated from a 2d-convolutional layer.
1979 * This component implements 3d-maxpooling which performs
1980 * max pooling along the three axes.
1981 * Input : A matrix where each row is a vectorized 3D-tensor.
1982 * The 3D tensor has dimensions
1983 * x: (e.g. time)
1984 * y: (e.g. frequency)
1985 * z: (e.g. channels like number of filters in the ConvolutionComponent)
1986 *
1987 * The component assumes input vectorizations of type zyx
1988 * which is the default output vectorization type of a ConvolutionComponent.
1989 * e.g. for input vectorization of type zyx the input is vectorized by
1990 * spanning axes z, y and x of the tensor in that order.
1991 * Given 3d tensor A with sizes (2, 2, 2) along the three dimensions
1992 * the zyx vectorized input looks like
1993 * A(0,0,0) A(0,0,1) A(0,1,0) A(0,1,1) A(1,0,0) A(1,0,1) A(1,1,0) A(1,1,1)
1994 *
1995 * Output : The output is also a 3D tensor vectorized in the zyx format.
1996 *
1997 * For information on the hyperparameters and parameters of this component see
1998 * the variable declarations.
1999 *
2000 *
2001 */
class MaxpoolingComponent: public Component {
 public:

  MaxpoolingComponent(): input_x_dim_(0), input_y_dim_(0), input_z_dim_(0),
                         pool_x_size_(0), pool_y_size_(0), pool_z_size_(0),
                         pool_x_step_(0), pool_y_step_(0), pool_z_step_(0) { }
  // constructor using another component
  MaxpoolingComponent(const MaxpoolingComponent &component);

  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;
  // Verifies that the configuration values are consistent.
  // NOTE(review): see the .cc file for the exact conditions checked.
  virtual void Check() const;

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput|
           kBackpropAdds;
  }

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *, // to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual Component* Copy() const { return new MaxpoolingComponent(*this); }

  // Helper: extracts from 'in' the vectorized patch of input elements for
  // each pooling window, one patch per row of 'patches' (cf. the same-named
  // helper in ConvolutionComponent).
  void InputToInputPatches(const CuMatrixBase<BaseFloat>& in,
                           CuMatrix<BaseFloat> *patches) const;
  // Helper: accumulates per-patch input derivatives back into the full
  // input-derivative matrix (reverse of the mapping done by
  // InputToInputPatches).
  void InderivPatchesToInderiv(const CuMatrix<BaseFloat>& in_deriv_patches,
                               CuMatrixBase<BaseFloat> *in_deriv) const;

 protected:
  int32 input_x_dim_;   // size of the input along x-axis
                        // (e.g. number of time steps)
  int32 input_y_dim_;   // size of input along y-axis
                        // (e.g. number of mel-frequency bins)
  int32 input_z_dim_;   // size of input along z-axis
                        // (e.g. number of filters in the ConvolutionComponent)

  int32 pool_x_size_;   // size of the pooling window along x-axis
  int32 pool_y_size_;   // size of the pooling window along y-axis
  int32 pool_z_size_;   // size of the pooling window along z-axis

  int32 pool_x_step_;   // the number of steps taken along x-axis of input
                        // before computing the next pool
  int32 pool_y_step_;   // the number of steps taken along y-axis of input
                        // before computing the next pool
  int32 pool_z_step_;   // the number of steps taken along z-axis of input
                        // before computing the next pool

};
2069 /**
2070 CompositeComponent is a component representing a sequence of
2071 [simple] components. The config line would be something like the following
2072 (imagine this is all on one line):
2074 component name=composite1 type=CompositeComponent max-rows-process=2048 num-components=3 \
2075 component1='type=BlockAffineComponent input-dim=1000 output-dim=10000 num-blocks=100' \
2076 component2='type=RectifiedLinearComponent dim=10000' \
2077 component3='type=BlockAffineComponent input-dim=10000 output-dim=1000 num-blocks=100'
2079 The reason you might want to use this component, instead of directly using
2080 the same sequence of components in the config file, is to save GPU memory (at
2081 the expense of more compute)-- because doing it like this means we have to
2082 re-do parts of the forward pass in the backprop phase, but we avoid using
2083 much memory for very long (and you can make the memory usage very small by
2084 making max-rows-process small). We inherit from UpdatableComponent just in
2085 case one or more of the components in the sequence are updatable.
2087 It is an error to nest a CompositeComponent inside a CompositeComponent.
2088 The same effect can be accomplished by specifying a smaller max-rows-process
2089 in a single CompositeComponent.
2090 */
class CompositeComponent: public UpdatableComponent {
 public:
  virtual int32 InputDim() const;
  virtual int32 OutputDim() const;

  virtual std::string Info() const;

  virtual void InitFromConfig(ConfigLine *cfl);

  virtual Component* Copy() const;

  CompositeComponent() { } // use Init() or InitFromConfig() to really initialize.

  // Initialize from this list of components; takes ownership of the pointers.
  void Init(const std::vector<Component*> &components,
            int32 max_rows_process);

  virtual std::string Type() const { return "CompositeComponent"; }

  // The properties depend on the properties of the constituent components.  As
  // a special case, we never return kStoresStats in the properties: by default
  // we store things like activation stats (e.g. for nonlinear components like
  // ReLU) as part of the backprop.  This means we may wastefully store stats
  // even when not requested, but it does save time as a separate StoreStats()
  // call would involve propagating the internals.
  virtual int32 Properties() const;

  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // note, we don't implement StoreStats() as it would be inefficient.  Instead,
  // by default we call StoreStats() on all members that have the flag set,
  // inside the Backprop.
  virtual void ZeroStats();

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;

  // Don't implement Copy() at this level: implement it in the child class.

  // Some functions from base-class UpdatableComponent.
  virtual void SetUnderlyingLearningRate(BaseFloat lrate);
  virtual void SetActualLearningRate(BaseFloat lrate);
  virtual void SetAsGradient();
  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void PerturbParams(BaseFloat stddev);
  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
  virtual int32 NumParameters() const;
  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
  virtual void UnVectorize(const VectorBase<BaseFloat> &params);

  // note: we don't implement the StoreStats function as it would be quite
  // expensive; instead, by default we call StoreStats() for any components that
  // want to store stats, as part of the backprop pass.  This is not 100% ideal
  // but it will usually do what you want.  We can revisit this later if needed.

  // Functions to iterate over the internal components

  int32 NumComponents() const { return components_.size();}
  /// Gets the ith component in this component.
  /// The ordering is the same as in the config line. The caller
  /// does not own the received component.
  const Component* GetComponent(int32 i) const;
  /// Sets the ith component. After this call, CompositeComponent owns
  /// the reference to the argument component. Frees the previous
  /// ith component.
  void SetComponent(int32 i, Component *component);

  virtual ~CompositeComponent() { DeletePointers(&components_); }
 private:
  // returns the stride type, kDefaultStride or kStrideEqualNumCols,
  // at the output of the i'th component.
  inline MatrixStrideType GetStrideType(int32 i) const;

  // returns true if at least one of 'components_' returns the kUpdatable flag
  // in its flags.
  bool IsUpdatable() const;

  // the maximum number of rows to process at a time; smaller values reduce
  // peak memory use at the cost of extra compute (see the comment at the top
  // of this class).
  int32 max_rows_process_;
  std::vector<Component*> components_;

};
2185 } // namespace nnet3
2186 } // namespace kaldi
2189 #endif