author    Pegah Ghahremani <pegahgh@gmail.com>    Wed, 1 Oct 2014 19:37:53 +0000 (19:37 +0000)
committer Pegah Ghahremani <pegahgh@gmail.com>    Wed, 1 Oct 2014 19:37:53 +0000 (19:37 +0000)
src/nnet2/nnet-component-test.cc
src/nnet2/nnet-component.cc
src/nnet2/nnet-component.h
diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc
index e7441f3d4f1e59d3cc878fe08cacd45e488fac6b..8ec02168b95b63c8b6ce3230d5f73316b1843e4e 100644
}
}
-
-void UnitTestPiecewiseLinearComponent() {
- BaseFloat learning_rate = 0.01, max_change = 0.1 * (Rand() % 2);
- int32 dim = 5 + Rand() % 10, N = 3 + 2 * (Rand() % 5);
- {
- PiecewiseLinearComponent component;
- component.Init(dim, N, learning_rate, max_change);
- UnitTestGenericComponentInternal(component);
- }
- {
- const char *str = "learning-rate=0.01 dim=10 N=5 max-change=0.01";
- PiecewiseLinearComponent component;
- component.InitFromString(str);
- UnitTestGenericComponentInternal(component);
- }
-}
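These unit tests all funnel into UnitTestGenericComponentInternal(), whose body is outside this diff. As a rough standalone sketch of the kind of check such a harness relies on (plain C++ with illustrative names, not the Kaldi API), a finite-difference gradient test looks like this:

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Hypothetical nonlinearity f(x) = x*|x| and its analytic derivative.
    static double f(double x) { return x * std::fabs(x); }
    static double f_deriv(double x) { return 2.0 * std::fabs(x); }

    int main() {
      // Compare the analytic derivative against a central difference;
      // component-level gradient tests are built on the same idea.
      const double eps = 1e-5;
      for (double x = -2.0; x <= 2.0; x += 0.5) {
        double numeric = (f(x + eps) - f(x - eps)) / (2.0 * eps);
        assert(std::fabs(numeric - f_deriv(x)) < 1e-4);
      }
      std::printf("gradient check passed\n");
      return 0;
    }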
-
-
-
void UnitTestScaleComponent() {
int32 dim = 1 + Rand() % 10;
BaseFloat scale = 0.1 + Rand() % 3;
}
}
-
-void UnitTestAffineComponentModified() {
- BaseFloat learning_rate = 0.01,
- param_stddev = 0.1, bias_stddev = 1.0, length_cutoff = 10.0,
- max_change = 0.1;
- int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
- {
- AffineComponentModified component;
- if (Rand() % 2 == 0) {
- component.Init(learning_rate, input_dim, output_dim,
- param_stddev, bias_stddev,
- length_cutoff, max_change);
- } else {
- Matrix<BaseFloat> mat(output_dim + 1, input_dim);
- mat.SetRandn();
- mat.Scale(param_stddev);
- WriteKaldiObject(mat, "tmpf", true);
- sleep(1);
- component.Init(learning_rate, length_cutoff, max_change, "tmpf");
- unlink("tmpf");
- }
- UnitTestGenericComponentInternal(component);
- }
- {
- const char *str = "learning-rate=0.01 input-dim=16 output-dim=15 param-stddev=0.1 cutoff-length=10.0 max-change=0.01";
- AffineComponentModified component;
- component.InitFromString(str);
- UnitTestGenericComponentInternal(component);
- }
-}
-
-
-void UnitTestAffinePreconInputComponent() {
- BaseFloat learning_rate = 0.01,
- param_stddev = 0.1, bias_stddev = 1.0,
- avg_samples = 100.0;
- int32 input_dim = 5 + Rand() % 10, output_dim = 5 + Rand() % 10;
-
- {
- AffinePreconInputComponent component;
- component.Init(learning_rate, input_dim, output_dim,
- param_stddev, bias_stddev, avg_samples);
- UnitTestGenericComponentInternal(component);
- }
- {
- const char *str = "learning-rate=0.01 input-dim=10 output-dim=15 param-stddev=0.1 avg-samples=100";
- AffinePreconInputComponent component;
- component.InitFromString(str);
- UnitTestGenericComponentInternal(component);
- }
-}
-
void UnitTestBlockAffineComponent() {
BaseFloat learning_rate = 0.01,
param_stddev = 0.1, bias_stddev = 0.1;
}
}
-void UnitTestMixtureProbComponent() {
- BaseFloat learning_rate = 0.01,
- diag_element = 0.8;
- std::vector<int32> sizes;
- int32 num_sizes = 1 + Rand() % 5; // allow between 1 and 5 blocks.
- for (int32 i = 0; i < num_sizes; i++)
- sizes.push_back(2 + Rand() % 5); // TODO: change to 1 + Rand() % 5
- // and fix test errors. May be issue in the code itself.
-
-
- {
- MixtureProbComponent component;
- component.Init(learning_rate, diag_element, sizes);
- UnitTestGenericComponentInternal(component);
- }
- {
- const char *str = "learning-rate=0.01 diag-element=0.9 dims=3:4:5";
- MixtureProbComponent component;
- component.InitFromString(str);
- UnitTestGenericComponentInternal(component);
- }
-}
-
void UnitTestSumGroupComponent() {
std::vector<int32> sizes;
UnitTestGenericComponent<SoftmaxComponent>();
UnitTestGenericComponent<RectifiedLinearComponent>();
UnitTestGenericComponent<SoftHingeComponent>();
- UnitTestGenericComponent<PowerExpandComponent>("higher-power-scale=0.1");
UnitTestMaxoutComponent();
UnitTestPnormComponent();
UnitTestGenericComponent<NormalizeComponent>();
UnitTestSigmoidComponent();
UnitTestAffineComponent();
- UnitTestPiecewiseLinearComponent();
UnitTestScaleComponent();
- UnitTestAffinePreconInputComponent();
UnitTestBlockAffineComponent();
UnitTestBlockAffineComponentPreconditioned();
- UnitTestMixtureProbComponent();
UnitTestSumGroupComponent();
UnitTestDctComponent();
UnitTestFixedLinearComponent();
UnitTestFixedBiasComponent();
UnitTestAffineComponentPreconditioned();
UnitTestAffineComponentPreconditionedOnline();
- UnitTestAffineComponentModified();
UnitTestDropoutComponent();
UnitTestAdditiveNoiseComponent();
UnitTestParsing();
diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc
index 0d2d4ddd2b1d3d8b1f6b32af6655904609001d86..5112f9930d188259a4cb5dd9b5bb547064af00ba 100644
ans = new MaxoutComponent();
} else if (component_type == "ScaleComponent") {
ans = new ScaleComponent();
- } else if (component_type == "PowerExpandComponent") {
- ans = new PowerExpandComponent();
} else if (component_type == "AffineComponent") {
ans = new AffineComponent();
- } else if (component_type == "PiecewiseLinearComponent") {
- ans = new PiecewiseLinearComponent();
- } else if (component_type == "AffineComponentA") {
- ans = new AffineComponentA();
} else if (component_type == "AffineComponentPreconditioned") {
ans = new AffineComponentPreconditioned();
} else if (component_type == "AffineComponentPreconditionedOnline") {
ans = new AffineComponentPreconditionedOnline();
- } else if (component_type == "AffineComponentModified") {
- ans = new AffineComponentModified();
- } else if (component_type == "AffinePreconInputComponent") {
- ans = new AffinePreconInputComponent();
- } else if (component_type == "MixtureProbComponent") {
- ans = new MixtureProbComponent();
} else if (component_type == "SumGroupComponent") {
ans = new SumGroupComponent();
} else if (component_type == "BlockAffineComponent") {
}
-void PowerExpandComponent::Init(int32 dim,
- int32 max_power,
- BaseFloat higher_power_scale) {
- input_dim_ = dim;
- max_power_ = max_power;
- higher_power_scale_ = higher_power_scale;
- KALDI_ASSERT(input_dim_ > 0 && max_power >= 1 && higher_power_scale > 0.0);
-}
-
-void PowerExpandComponent::InitFromString(std::string args) {
- std::string orig_args(args);
- int32 dim, max_power = 2;
- BaseFloat higher_power_scale = 1.0;
- ParseFromString("max-power", &args, &max_power); // Optional.
- ParseFromString("higher-power-scale", &args, &higher_power_scale); // Optional.
- // Accept either "dim" or "input-dim" to specify the input dim.
- // "input-dim" is the canonical one; "dim" simplifies the testing code.
- bool ok = (ParseFromString("dim", &args, &dim) ||
- ParseFromString("input-dim", &args, &dim));
- if (!ok || !args.empty() || dim <= 0)
- KALDI_ERR << "Invalid initializer for layer of type "
- << Type() << ": \"" << orig_args << "\"";
- Init(dim, max_power, higher_power_scale);
-}
-
-
-void PowerExpandComponent::Propagate(const CuMatrixBase<BaseFloat> &in,
- int32 num_chunks,
- CuMatrix<BaseFloat> *out) const {
- out->Resize(in.NumRows(), in.NumCols() * max_power_, kUndefined);
- for (int32 p = 1; p <= max_power_; p++) {
- CuSubMatrix<BaseFloat> out_part(*out, 0, in.NumRows(),
- in.NumCols() * (p - 1), in.NumCols());
- out_part.CopyFromMat(in);
- if (p != 1) {
- out_part.ApplyPow(p);
- if (higher_power_scale_ != 1.0)
- out_part.Scale(higher_power_scale_);
- }
- }
-}
-
-void PowerExpandComponent::Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &,// out_value,
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32, // num_chunks
- Component *, // to_update
- CuMatrix<BaseFloat> *in_deriv) const {
- in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kUndefined);
- CuMatrix<BaseFloat> temp(in_value.NumRows(), in_value.NumCols(), kUndefined);
- for (int32 p = 1; p <= max_power_; p++) {
- const CuSubMatrix<BaseFloat> out_deriv_part(out_deriv, 0, in_value.NumRows(),
- in_value.NumCols() * (p - 1),
- in_value.NumCols());
- if (p == 1) {
- in_deriv->CopyFromMat(out_deriv_part);
- } else {
- // in scalar terms: in_deriv += p * in_value^(p-1) * [out_deriv w.r.t. this power]
- temp.CopyFromMat(in_value);
- if (p > 2) temp.ApplyPow(p - 1);
- temp.MulElements(out_deriv_part);
- in_deriv->AddMat(p * higher_power_scale_, temp);
- }
- }
-}
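The PowerExpandComponent removed above maps each input x to the stacked powers [x, s*x^2, ..., s*x^P] and backpropagates through d(s*x^p)/dx = s*p*x^(p-1). A minimal scalar sketch of that forward/backward pair, in plain C++ rather than the CuMatrix API:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Forward: expand x into [x, s*x^2, ..., s*x^P]; the scale s is only
    // applied to powers above 1, matching Propagate() above.
    std::vector<double> PowerExpand(double x, int P, double s) {
      std::vector<double> out(P);
      for (int p = 1; p <= P; p++)
        out[p - 1] = (p == 1 ? x : s * std::pow(x, p));
      return out;
    }

    // Backward: accumulate dy/dx over all output powers.
    double PowerExpandBackprop(double x, int P, double s,
                               const std::vector<double> &out_deriv) {
      double in_deriv = 0.0;
      for (int p = 1; p <= P; p++)
        in_deriv += out_deriv[p - 1] *
            (p == 1 ? 1.0 : s * p * std::pow(x, p - 1));
      return in_deriv;
    }

    int main() {
      std::vector<double> g(3, 1.0);  // stand-in for out_deriv
      std::printf("in_deriv = %g\n", PowerExpandBackprop(0.5, 3, 0.1, g));
      return 0;
    }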
-
-void PowerExpandComponent::Read(std::istream &is, bool binary) {
- ExpectOneOrTwoTokens(is, binary, "<PowerExpandComponent>", "<InputDim>");
- ReadBasicType(is, binary, &input_dim_);
- ExpectToken(is, binary, "<MaxPower>");
- ReadBasicType(is, binary, &max_power_);
- ExpectToken(is, binary, "<HigherPowerScale>");
- ReadBasicType(is, binary, &higher_power_scale_);
- ExpectToken(is, binary, "</PowerExpandComponent>");
-}
-
-void PowerExpandComponent::Write(std::ostream &os, bool binary) const {
- WriteToken(os, binary, "<PowerExpandComponent>");
- WriteToken(os, binary, "<InputDim>");
- WriteBasicType(os, binary, input_dim_);
- WriteToken(os, binary, "<MaxPower>");
- WriteBasicType(os, binary, max_power_);
- WriteToken(os, binary, "<HigherPowerScale>");
- WriteBasicType(os, binary, higher_power_scale_);
- WriteToken(os, binary, "</PowerExpandComponent>");
-}
-
-std::string PowerExpandComponent::Info() const {
- std::stringstream stream;
- stream << Type() << ", input-dim=" << input_dim_
- << ", max-power=" << max_power_;
- return stream.str();
-}
-
void NonlinearComponent::SetDim(int32 dim) {
KALDI_ASSERT(dim>0);
dim_ = dim;
return ans;
}
-
-void PiecewiseLinearComponent::Scale(BaseFloat scale) {
- params_.Scale(scale);
-}
-
-void PiecewiseLinearComponent::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
- const PiecewiseLinearComponent *other =
- dynamic_cast<const PiecewiseLinearComponent*>(&other_in);
- KALDI_ASSERT(other != NULL);
- params_.AddMat(alpha, other->params_);
-}
-
-PiecewiseLinearComponent::PiecewiseLinearComponent(const PiecewiseLinearComponent &component):
- UpdatableComponent(component),
- params_(component.params_),
- is_gradient_(component.is_gradient_),
- max_change_(component.max_change_) { }
-
-void PiecewiseLinearComponent::SetZero(bool treat_as_gradient) {
- if (treat_as_gradient) {
- SetLearningRate(1.0);
- is_gradient_ = true;
- }
- params_.SetZero();
-}
-
-void PiecewiseLinearComponent::PerturbParams(BaseFloat stddev) {
- CuMatrix<BaseFloat> temp_params(params_);
- temp_params.SetRandn();
- params_.AddMat(stddev, temp_params);
-}
-
-std::string PiecewiseLinearComponent::Info() const {
- std::stringstream stream;
- BaseFloat params_size = static_cast<BaseFloat>(params_.NumRows())
- * static_cast<BaseFloat>(params_.NumCols());
- BaseFloat stddev =
- std::sqrt(TraceMatMat(params_, params_, kTrans) / params_size);
- CuVector<BaseFloat> per_dim_mean(params_.NumCols());
- CuVector<BaseFloat> per_dim_stddev(params_.NumCols());
- for (int32 dim = 0; dim < params_.NumCols(); dim++) {
- CuVector<BaseFloat> temp(params_.NumRows());
- temp.CopyColFromMat(params_, dim);
- BaseFloat mean = temp.Sum() / temp.Dim(),
- scatter = VecVec(temp, temp) / temp.Dim(),
- var = scatter - mean * mean,
- stddev = std::sqrt(var);
- per_dim_mean(dim) = mean;
- per_dim_stddev(dim) = stddev;
- }
- stream << Type() << ", input-dim=" << InputDim()
- << ", output-dim=" << OutputDim()
- << ", N=" << (params_.NumCols() - 2)
- << ", global-params-stddev=" << stddev
- << ", params-mean=" << per_dim_mean
- << ", params-stddev=" << per_dim_stddev
- << ", learning-rate=" << LearningRate()
- << ", max-change=" << max_change_;
- return stream.str();
-}
-
-Component* PiecewiseLinearComponent::Copy() const {
- PiecewiseLinearComponent *ans = new PiecewiseLinearComponent();
- ans->learning_rate_ = learning_rate_;
- ans->params_ = params_;
- ans->is_gradient_ = is_gradient_;
- ans->max_change_ = max_change_;
- return ans;
-}
-
-BaseFloat PiecewiseLinearComponent::DotProduct(const UpdatableComponent &other_in) const {
- const PiecewiseLinearComponent *other =
- dynamic_cast<const PiecewiseLinearComponent*>(&other_in);
- return TraceMatMat(params_, other->params_, kTrans);
-}
-
-void PiecewiseLinearComponent::Init(int32 dim, int32 N,
- BaseFloat learning_rate,
- BaseFloat max_change) {
- UpdatableComponent::Init(learning_rate);
- params_.Resize(dim, N + 2); // will set them to zero.
- KALDI_ASSERT(N >= 3 && N % 2 == 1 &&
- "PiecewiseLinearComponent: must have N >= 3 and odd.");
- for (int32 i = 0; i < dim; i++) {
- // The "middle" gamma index has c_i = 0. If we
- // initialize with all parameters zero except beta = 0.5 and the middle gamma
- // = 0.5, then we have f(x) = 0.5 x + 0.5 |x| = max(x, 0), which is the
- // same as the ReLU function.
- BaseFloat beta = 0.5, middle_gamma = 0.5;
- int32 middle_index = (N - 1) / 2;
- params_(i, 1) = beta;
- params_(i, middle_index + 2) = middle_gamma;
- }
- max_change_ = max_change;
-}
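With all parameters zero except beta = 0.5 and the middle gamma = 0.5, the function f(x) = alpha + beta*x + sum_n gamma_n*|x - c_n| computed in Propagate() below reduces to 0.5*x + 0.5*|x| = max(x, 0), since the middle breakpoint is c = 0. A standalone sketch of the forward function and that ReLU check (plain C++, not the Kaldi code):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <vector>

    // f(x) = alpha + beta*x + sum_n gamma[n]*|x - c_n|, with the c_n
    // equally spaced on [-1, 1]: tick = 2/(N-1), as in Propagate().
    double PiecewiseLinear(double x, double alpha, double beta,
                           const std::vector<double> &gamma) {
      int N = gamma.size();
      double tick = 2.0 / (N - 1), y = alpha + beta * x;
      for (int n = 0; n < N; n++)
        y += gamma[n] * std::fabs(x - (-1.0 + tick * n));
      return y;
    }

    int main() {
      int N = 5;                          // must be odd and >= 3
      std::vector<double> gamma(N, 0.0);
      gamma[(N - 1) / 2] = 0.5;           // breakpoint c = 0 for odd N
      for (double x = -2.0; x <= 2.0; x += 0.25)
        assert(std::fabs(PiecewiseLinear(x, 0.0, 0.5, gamma) -
                         std::max(x, 0.0)) < 1e-12);
      return 0;
    }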
-
-void PiecewiseLinearComponent::InitFromString(std::string args) {
- std::string orig_args(args);
- bool ok = true;
- BaseFloat learning_rate = learning_rate_, max_change = 0.0;
- int32 dim = -1, N = 1;
- ParseFromString("learning-rate", &args, &learning_rate); // optional.
- ParseFromString("max-change", &args, &max_change); // optional.
- ok = ok && ParseFromString("dim", &args, &dim);
- ok = ok && ParseFromString("N", &args, &N);
-
- if (!args.empty())
- KALDI_ERR << "Could not process these elements in initializer: "
- << args;
- if (!ok)
- KALDI_ERR << "Bad initializer " << orig_args;
- Init(dim, N, learning_rate, max_change);
-}
-
-
-void PiecewiseLinearComponent::Propagate(const CuMatrixBase<BaseFloat> &in,
- int32, // num_chunks
- CuMatrix<BaseFloat> *out) const {
- out->Resize(in.NumRows(), OutputDim());
-
- KALDI_ASSERT(in.NumCols() == InputDim());
-
-
- int32 dim = OutputDim(), num_frames = in.NumRows(), N = params_.NumCols() - 2;
- BaseFloat tick = 2.0 / (N - 1);
- // "tick" is the distance between the c_i.
-
- for (int32 t = 0; t < num_frames; t++) {
- for (int32 d = 0; d < dim; d++) {
- BaseFloat x = in(t, d);
- BaseFloat alpha = params_(d, 0), beta = params_(d, 1);
- BaseFloat y = alpha + x * beta;
- for (int32 n = 0; n < N; n++) {
- BaseFloat c_n = -1.0 + tick * n, gamma_n = params_(d, n + 2);
- y += gamma_n * std::abs(x - c_n);
- }
- (*out)(t, d) = y;
- }
- }
-}
-
-
-void PiecewiseLinearComponent::Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &, // out_value
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32, // num_chunks
- Component *to_update_in,
- CuMatrix<BaseFloat> *in_deriv) const {
-
- PiecewiseLinearComponent *to_update =
- dynamic_cast<PiecewiseLinearComponent*>(to_update_in);
-
- KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() &&
- in_value.NumCols() == InputDim());
-
- in_deriv->Resize(in_value.NumRows(), InputDim());
-
- int32 dim = OutputDim(), num_frames = in_value.NumRows(),
- N = params_.NumCols() - 2;
- BaseFloat tick = 2.0 / (N - 1);
- // "tick" is the distance between the c_i.
-
- CuMatrix<BaseFloat> param_deriv(params_.NumRows(), params_.NumCols());
-
- for (int32 t = 0; t < num_frames; t++) {
- for (int32 d = 0; d < dim; d++) {
- BaseFloat x = in_value(t, d), oderiv = out_deriv(t, d), ideriv = 0.0;
- BaseFloat beta = params_(d, 1);
- // in forward: y = alpha + x * beta.
- ideriv += beta * oderiv;
- param_deriv(d, 0) += 1.0 * oderiv;
- param_deriv(d, 1) += x * oderiv;
-
- for (int32 n = 0; n < N; n++) {
- BaseFloat c_n = -1.0 + tick * n, gamma_n = params_(d, n + 2);
- // in forward: y += gamma_n * std::abs(x - c_n);
- ideriv += oderiv * gamma_n * (x >= c_n ? 1.0 : -1.0);
- param_deriv(d, n + 2) += oderiv * std::abs(x - c_n);
- }
- (*in_deriv)(t, d) = ideriv;
- }
- }
- if (to_update != NULL) {
- if (to_update->is_gradient_ || to_update->max_change_ == 0.0) {
- to_update->params_.AddMat(to_update->learning_rate_, param_deriv);
- } else {
- param_deriv.Scale(to_update->learning_rate_);
- param_deriv.ApplyCeiling(to_update->max_change_);
- param_deriv.ApplyFloor(-to_update->max_change_);
- to_update->params_.AddMat(1.0, param_deriv);
- }
- }
-}
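When max_change_ is nonzero (and the component is not a gradient), the update above clamps each element of the scaled gradient to [-max_change, max_change] via ApplyCeiling()/ApplyFloor(). The element-wise equivalent, as a minimal sketch:

    #include <algorithm>

    // Element-wise analogue of scaling by the learning rate, then
    // ApplyCeiling(max_change) followed by ApplyFloor(-max_change).
    double ClampedStep(double grad, double learning_rate, double max_change) {
      double delta = learning_rate * grad;
      return std::max(-max_change, std::min(max_change, delta));
    }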
-
-void PiecewiseLinearComponent::Read(std::istream &is, bool binary) {
- std::ostringstream ostr_beg, ostr_end;
- ostr_beg << "<" << Type() << ">"; // e.g. "<PiecewiseLinearComponent>"
- ostr_end << "</" << Type() << ">"; // e.g. "</PiecewiseLinearComponent>"
- // might not see the "<PiecewiseLinearComponent>" part because
- // of how ReadNew() works.
- ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
- ReadBasicType(is, binary, &learning_rate_);
- ExpectToken(is, binary, "<Params>");
- params_.Read(is, binary);
- ExpectToken(is, binary, "<IsGradient>");
- ReadBasicType(is, binary, &is_gradient_);
- ExpectToken(is, binary, "<MaxChange>");
- ReadBasicType(is, binary, &max_change_);
- ExpectToken(is, binary, ostr_end.str());
-}
-
-
-void PiecewiseLinearComponent::Write(std::ostream &os, bool binary) const {
- std::ostringstream ostr_beg, ostr_end;
- ostr_beg << "<" << Type() << ">"; // e.g. "<PiecewiseLinearComponent>"
- ostr_end << "</" << Type() << ">"; // e.g. "</PiecewiseLinearComponent>"
- WriteToken(os, binary, ostr_beg.str());
- WriteToken(os, binary, "<LearningRate>");
- WriteBasicType(os, binary, learning_rate_);
- WriteToken(os, binary, "<Params>");
- params_.Write(os, binary);
- WriteToken(os, binary, "<IsGradient>");
- WriteBasicType(os, binary, is_gradient_);
- WriteToken(os, binary, "<MaxChange>");
- WriteBasicType(os, binary, max_change_);
- WriteToken(os, binary, ostr_end.str());
-}
-
-int32 PiecewiseLinearComponent::GetParameterDim() const {
- return params_.NumRows() * params_.NumCols();
-}
-
-void PiecewiseLinearComponent::Vectorize(VectorBase<BaseFloat> *params) const {
- params->CopyRowsFromMat(params_);
-}
-void PiecewiseLinearComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
- params_.CopyRowsFromVec(params);
-}
-
-
void AffineComponentPreconditioned::Read(std::istream &is, bool binary) {
std::ostringstream ostr_beg, ostr_end;
ostr_beg << "<" << Type() << ">"; // e.g. "<AffineComponentPreconditioned>"
in_value_precon_part, kNoTrans, 1.0);
}
-
-void AffineComponentModified::Read(std::istream &is, bool binary) {
- std::ostringstream ostr_beg, ostr_end;
- ostr_beg << "<" << Type() << ">"; // e.g. "<AffineComponent>"
- ostr_end << "</" << Type() << ">"; // e.g. "</AffineComponent>"
- // might not see the "<AffineComponent>" part because
- // of how ReadNew() works.
- ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
- ReadBasicType(is, binary, &learning_rate_);
- ExpectToken(is, binary, "<LinearParams>");
- linear_params_.Read(is, binary);
- ExpectToken(is, binary, "<BiasParams>");
- bias_params_.Read(is, binary);
- ExpectToken(is, binary, "<CutoffLength>");
- ReadBasicType(is, binary, &cutoff_length_);
- ExpectToken(is, binary, "<MaxChange>");
- ReadBasicType(is, binary, &max_change_);
- ExpectToken(is, binary, ostr_end.str());
+void BlockAffineComponent::SetZero(bool treat_as_gradient) {
+ if (treat_as_gradient) {
+ SetLearningRate(1.0);
+ }
+ linear_params_.SetZero();
+ bias_params_.SetZero();
}
+void BlockAffineComponent::PerturbParams(BaseFloat stddev) {
+ CuMatrix<BaseFloat> temp_linear_params(linear_params_);
+ temp_linear_params.SetRandn();
+ linear_params_.AddMat(stddev, temp_linear_params);
+
+ CuVector<BaseFloat> temp_bias_params(bias_params_);
+ temp_bias_params.SetRandn();
+ bias_params_.AddVec(stddev, temp_bias_params);
+}
-void AffineComponentModified::InitFromString(std::string args) {
- std::string orig_args(args);
- bool ok = true;
- std::string matrix_filename;
- BaseFloat learning_rate = learning_rate_;
- BaseFloat cutoff_length = 0.25, max_change = 0.1;
- int32 input_dim = -1, output_dim = -1;
- ParseFromString("learning-rate", &args, &learning_rate); // optional.
- ParseFromString("cutoff-length", &args, &cutoff_length);
- ParseFromString("max-change", &args, &max_change);
+BaseFloat BlockAffineComponent::DotProduct(
+ const UpdatableComponent &other_in) const {
+ const BlockAffineComponent *other =
+ dynamic_cast<const BlockAffineComponent*>(&other_in);
+ return TraceMatMat(linear_params_, other->linear_params_, kTrans)
+ + VecVec(bias_params_, other->bias_params_);
+}
- if (ParseFromString("matrix", &args, &matrix_filename)) {
- Init(learning_rate, cutoff_length, max_change, matrix_filename);
- if (ParseFromString("input-dim", &args, &input_dim))
- KALDI_ASSERT(input_dim == InputDim() &&
- "input-dim mismatch vs. matrix.");
- if (ParseFromString("output-dim", &args, &output_dim))
- KALDI_ASSERT(output_dim == OutputDim() &&
- "output-dim mismatch vs. matrix.");
- } else {
- ok = ok && ParseFromString("input-dim", &args, &input_dim);
- ok = ok && ParseFromString("output-dim", &args, &output_dim);
- BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
- bias_stddev = 0.0;
- ParseFromString("param-stddev", &args, ¶m_stddev);
- ParseFromString("bias-stddev", &args, &bias_stddev);
- Init(learning_rate, input_dim, output_dim, param_stddev,
- bias_stddev, cutoff_length, max_change);
- }
- if (!args.empty())
- KALDI_ERR << "Could not process these elements in initializer: "
- << args;
- if (!ok)
- KALDI_ERR << "Bad initializer " << orig_args;
+Component* BlockAffineComponent::Copy() const {
+ BlockAffineComponent *ans = new BlockAffineComponent();
+ ans->learning_rate_ = learning_rate_;
+ ans->linear_params_ = linear_params_;
+ ans->bias_params_ = bias_params_;
+ ans->num_blocks_ = num_blocks_;
+ return ans;
}
-void AffineComponentModified::Init(BaseFloat learning_rate, BaseFloat length_cutoff,
- BaseFloat max_change, std::string matrix_filename) {
- UpdatableComponent::Init(learning_rate);
- cutoff_length_ = length_cutoff;
- max_change_ = max_change;
- CuMatrix<BaseFloat> mat;
- ReadKaldiObject(matrix_filename, &mat); // will abort on failure.
- KALDI_ASSERT(mat.NumCols() >= 2);
- int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows();
- linear_params_.Resize(output_dim, input_dim);
- bias_params_.Resize(output_dim);
- linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim));
- bias_params_.CopyColFromMat(mat, input_dim);
+void BlockAffineComponent::Scale(BaseFloat scale) {
+ linear_params_.Scale(scale);
+ bias_params_.Scale(scale);
}
-void AffineComponentModified::Init(
- BaseFloat learning_rate,
- int32 input_dim, int32 output_dim,
- BaseFloat param_stddev, BaseFloat bias_stddev,
- BaseFloat cutoff_length, BaseFloat max_change) {
- UpdatableComponent::Init(learning_rate);
- linear_params_.Resize(output_dim, input_dim);
- bias_params_.Resize(output_dim);
- KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
- linear_params_.SetRandn(); // sets to random normally distributed noise.
- linear_params_.Scale(param_stddev);
- bias_params_.SetRandn();
- bias_params_.Scale(bias_stddev);
- cutoff_length_ = cutoff_length;
- KALDI_ASSERT(max_change >= 0.0);
- max_change_ = max_change; // Note: any value of max_change_ is valid, but
- // only values > 0.0 will actually activate the code.
+void BlockAffineComponent::Add(BaseFloat alpha,
+ const UpdatableComponent &other_in) {
+ const BlockAffineComponent *other =
+ dynamic_cast<const BlockAffineComponent*>(&other_in);
+ KALDI_ASSERT(other != NULL);
+ linear_params_.AddMat(alpha, other->linear_params_);
+ bias_params_.AddVec(alpha, other->bias_params_);
}
-
-void AffineComponentModified::Write(std::ostream &os, bool binary) const {
- std::ostringstream ostr_beg, ostr_end;
- ostr_beg << "<" << Type() << ">"; // e.g. "<AffineComponent>"
- ostr_end << "</" << Type() << ">"; // e.g. "</AffineComponent>"
- WriteToken(os, binary, ostr_beg.str());
- WriteToken(os, binary, "<LearningRate>");
- WriteBasicType(os, binary, learning_rate_);
- WriteToken(os, binary, "<LinearParams>");
- linear_params_.Write(os, binary);
- WriteToken(os, binary, "<BiasParams>");
- bias_params_.Write(os, binary);
- WriteToken(os, binary, "<CutoffLength>");
- WriteBasicType(os, binary, cutoff_length_);
- WriteToken(os, binary, "<MaxChange>");
- WriteBasicType(os, binary, max_change_);
- WriteToken(os, binary, ostr_end.str());
-}
-
-std::string AffineComponentModified::Info() const {
- std::stringstream stream;
- BaseFloat linear_params_size = static_cast<BaseFloat>(linear_params_.NumRows())
- * static_cast<BaseFloat>(linear_params_.NumCols());
- BaseFloat linear_stddev =
- std::sqrt(TraceMatMat(linear_params_, linear_params_, kTrans) /
- linear_params_size),
- bias_stddev = std::sqrt(VecVec(bias_params_, bias_params_) /
- bias_params_.Dim());
- stream << Type() << ", input-dim=" << InputDim()
- << ", output-dim=" << OutputDim()
- << ", linear-params-stddev=" << linear_stddev
- << ", bias-params-stddev=" << bias_stddev
- << ", learning-rate=" << LearningRate()
- << ", cutoff_length=" << cutoff_length_
- << ", max-change=" << max_change_;
- return stream.str();
-}
-
-Component* AffineComponentModified::Copy() const {
- AffineComponentModified *ans = new AffineComponentModified();
- ans->learning_rate_ = learning_rate_;
- ans->linear_params_ = linear_params_;
- ans->bias_params_ = bias_params_;
- ans->cutoff_length_ = cutoff_length_;
- ans->max_change_ = max_change_;
- ans->is_gradient_ = is_gradient_;
- return ans;
-}
-
-void AffineComponentModified::Update(
- const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_deriv) {
-
- int32 output_dim = OutputDim(), input_dim = InputDim();
-
- CuMatrix<BaseFloat> delta_params(output_dim, input_dim + 1);
-
- { // set delta_params to the change in parameters under a
- // straightforward gradient descent.
- CuSubMatrix<BaseFloat> linear_delta_params(delta_params,
- 0, output_dim,
- 0, input_dim);
- linear_delta_params.AddMatMat(learning_rate_, out_deriv, kTrans,
- in_value, kNoTrans, 0.0);
-
- CuVector<BaseFloat> bias_delta_params(output_dim);
- bias_delta_params.AddRowSumMat(learning_rate_, out_deriv);
-
- delta_params.CopyColFromVec(bias_delta_params, input_dim);
- }
-
- // diagnostics:
- int32 num_below_cutoff = 0, num_below_cutoff_limited = 0,
- num_limited = 0;
-
- CuVector<BaseFloat> param_row(input_dim + 1);
- for (int32 d = 0; d < output_dim; d++) {
- CuSubVector<BaseFloat> delta_param_row(delta_params, d);
- // Get the corresponding row of current parameters:
- param_row.Range(0, input_dim).CopyFromVec(linear_params_.Row(d));
- param_row(input_dim) = bias_params_(d);
-
- BaseFloat length = sqrt(VecVec(param_row, param_row)),
- delta_length = sqrt(VecVec(delta_param_row, delta_param_row)),
- dot_product = VecVec(param_row, delta_param_row);
- if (length < cutoff_length_) {
- // length is below cutoff -> do normal gradient descent, except to prevent
- // very large changes, limit delta to cutoff_length_ times max_change_.
- num_below_cutoff++;
- if (delta_length > cutoff_length_ * max_change_) {
- delta_param_row.Scale((cutoff_length_ * max_change_) / delta_length);
- num_below_cutoff_limited++;
- }
- } else {
- BaseFloat scale = 1.0; // We'll later scale delta_param_row by this much.
- // First enforce that the length of delta_param_row cannot exceed the current
- // length of the row times max_change_.
- if (delta_length > length * max_change_) {
- scale = (length * max_change_) / delta_length;
- delta_length *= scale;
- dot_product *= scale;
- num_limited++;
- }
- // OK, now rescale the (param_row + delta_param_row)
- // such that its length equals the original length of param_row plus
- // the component of delta_param_row in the direction of param_row.
- BaseFloat delta_length_in_direction = dot_product / length,
- delta_length_perpendicular_sq =
- delta_length * delta_length -
- delta_length_in_direction * delta_length_in_direction;
- KALDI_ASSERT(delta_length_perpendicular_sq >= 0.0);
- // delta_length_in_direction equals the (signed) length of the component
- // of delta_param_row in the same direction as "param_row",
- // delta_length_perpendicular_sq is the squared length of the component
- // perpendicular to that.
- BaseFloat new_length = length + delta_length_in_direction;
- // "new_length" is the length that we want the sum (param_row +
- // delta_param_row) to be, but we will need to rescale to ensure this.
- BaseFloat actual_length = sqrt(new_length * new_length +
- delta_length_perpendicular_sq);
- BaseFloat scaling_factor = new_length / actual_length;
- // We want to scale (param_row + delta_param_row) by "scaling_factor",
- // and express the result as an offset from param_row so we can add to it:
- // we want scaling_factor * (param_row + delta_param_row) - param_row
- // which equals param_row * (scaling_factor-1) + scaling_factor * delta_param_row.
-
- delta_param_row.Scale(scale * scaling_factor); // The "scale" comes from a previous
- // length-limiting operation, we delayed its application until now for efficiency.
- delta_param_row.AddVec(scaling_factor - 1.0, param_row);
- }
- // Now apply the change.
- linear_params_.Row(d).AddVec(1.0, delta_param_row.Range(0, input_dim));
- bias_params_(d) += delta_param_row(input_dim);
- }
- static int32 num_messages_printed = 0;
- if (num_messages_printed < 100) {
- KALDI_LOG << "Processed " << output_dim << " parameter rows, of which "
- << num_below_cutoff << " were below length cutoff (of which "
- << num_below_cutoff_limited << " were limited); of the rest, "
- << num_limited << " had their length limited.";
- num_messages_printed++;
- }
-}
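The geometry in Update() above: after optionally capping |delta| at max_change times the row length, the new row p + delta is rescaled so its length equals |p| plus the signed component of delta along p; the perpendicular component can rotate the row but no longer grows it. A small standalone sketch of that rule (plain C++, illustrative only):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static double Dot(const std::vector<double> &a,
                      const std::vector<double> &b) {
      double s = 0.0;
      for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
      return s;
    }

    // Rescale (p + delta) so its length equals |p| plus the component of
    // delta along p -- the length-control rule used in Update() above.
    std::vector<double> LengthControlledStep(std::vector<double> p,
                                             const std::vector<double> &delta) {
      double length = std::sqrt(Dot(p, p)),
          delta_len = std::sqrt(Dot(delta, delta)),
          along = Dot(p, delta) / length,   // signed length of delta along p
          perp_sq = delta_len * delta_len - along * along,
          new_length = length + along,
          actual_length = std::sqrt(new_length * new_length + perp_sq),
          scaling = new_length / actual_length;
      for (size_t i = 0; i < p.size(); i++)
        p[i] = scaling * (p[i] + delta[i]);
      return p;
    }

    int main() {
      std::vector<double> p = {3.0, 0.0}, d = {0.1, 0.4};
      std::vector<double> q = LengthControlledStep(p, d);
      std::printf("new length = %f (expected 3.1)\n", std::sqrt(Dot(q, q)));
      return 0;
    }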
-
-
-
-void AffinePreconInputComponent::SetZero(bool treat_as_gradient) {
- if (treat_as_gradient) {
- SetLearningRate(1.0);
- is_gradient_ = true;
- }
- linear_params_.SetZero();
- bias_params_.SetZero();
-}
-
-void AffinePreconInputComponent::Backprop(
- const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &, // out_value
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32, // num_chunks
- Component *to_update_in,
- CuMatrix<BaseFloat> *in_deriv) const {
- AffinePreconInputComponent *to_update =
- dynamic_cast<AffinePreconInputComponent*>(to_update_in);
- in_deriv->Resize(out_deriv.NumRows(), InputDim());
- // Propagate the derivative back to the input.
- in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans,
- 0.0);
-
- if (to_update != NULL) {
- // Next update the model (must do this 2nd so the derivatives we propagate
- // are accurate, in case this == to_update_in.)
- // add the sum of the rows of out_deriv, to the bias_params_.
- to_update->bias_params_.AddRowSumMat(to_update->learning_rate_, out_deriv,
- 1.0);
- if (to_update->is_gradient_) { // simple update, getting gradient.
- to_update->linear_params_.AddMatMat(to_update->learning_rate_,
- out_deriv, kTrans,
- in_value, kNoTrans,
- 1.0);
- } else {
- // more complex update, correcting for variance of input features. Note:
- // most likely to_update == this, but we don't insist on this.
- CuMatrix<BaseFloat> in_value_tmp(in_value);
- in_value_tmp.MulColsVec(input_precision_); // Scale each column of in_value_tmp
- // (i.e. each dimension of the input features) by the corresponding element of
- // input_precision_.
-
- to_update->linear_params_.AddMatMat(to_update->learning_rate_,
- out_deriv, kTrans, in_value_tmp,
- kNoTrans, 1.0);
- // Next update input_precision_. Note: we don't use any scaling on the
- // samples at this point. This really won't matter in practice, it's just
- // for preconditioning. Note: avg_samples_ is not very precisely a number
- // of samples to average over, just in an approximate dimensional sense; the
- // inverse of this is the constant in the exponential averaging. Note: the
- // least we can actually average over is one minibatch; this is where the
- // std::max comes in below.
- int32 num_frames = in_value_tmp.NumRows();
- BaseFloat avg_samples_scaled =
- std::max(1.0, static_cast<double>(avg_samples_ / num_frames));
- BaseFloat cur_scale = 1.0 / avg_samples_scaled,
- prev_scale = 1.0 - cur_scale;
- CuVector<BaseFloat> &input_precision = to_update->input_precision_;
- input_precision.InvertElements();
- input_precision.AddDiagMat2(cur_scale, in_value, kTrans, prev_scale);
- if (input_precision.ApplyFloor(1.0e-10) > 0)
- KALDI_WARN << "Flooring elements of input feature variance.";
- input_precision.InvertElements();
- }
- }
-}
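The precision update above keeps, per input dimension, an exponentially-weighted estimate of the feature variance — with a time constant of roughly avg_samples_ frames, and never less than one minibatch — and stores its inverse for preconditioning. The per-dimension recursion, sketched as a scalar function (not the CuVector code; like the code itself it accumulates the raw minibatch sum of squares, which the comment above concedes is only approximate):

    #include <algorithm>
    #include <vector>

    // One minibatch's update of the stored precision for one dimension.
    // 'frames' holds this input dimension's values over the minibatch.
    double UpdatePrecision(double precision, const std::vector<double> &frames,
                           double avg_samples) {
      double num_frames = frames.size();
      double avg_scaled = std::max(1.0, avg_samples / num_frames);
      double cur_scale = 1.0 / avg_scaled, prev_scale = 1.0 - cur_scale;
      double sumsq = 0.0;
      for (size_t i = 0; i < frames.size(); i++) sumsq += frames[i] * frames[i];
      double variance = prev_scale / precision + cur_scale * sumsq;
      variance = std::max(variance, 1e-10);  // the ApplyFloor(1.0e-10) step
      return 1.0 / variance;                 // store precision, not variance
    }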
-
-void AffinePreconInputComponent::Read(std::istream &is, bool binary) {
- ExpectOneOrTwoTokens(is, binary, "<AffinePreconInputComponent>", "<LearningRate>");
- ReadBasicType(is, binary, &learning_rate_);
- ExpectToken(is, binary, "<AvgSamples>");
- ReadBasicType(is, binary, &avg_samples_);
- ExpectToken(is, binary, "<IsGradient>");
- ReadBasicType(is, binary, &is_gradient_);
- ExpectToken(is, binary, "<LinearParams>");
- linear_params_.Read(is, binary);
- ExpectToken(is, binary, "<BiasParams>");
- bias_params_.Read(is, binary);
- ExpectToken(is, binary, "<InputPrecision>");
- input_precision_.Read(is, binary);
- ExpectToken(is, binary, "</AffinePreconInputComponent>");
-}
-
-void AffinePreconInputComponent::Write(std::ostream &os, bool binary) const {
- WriteToken(os, binary, "<AffinePreconInputComponent>");
- WriteToken(os, binary, "<LearningRate>");
- WriteBasicType(os, binary, learning_rate_);
- WriteToken(os, binary, "<AvgSamples>");
- WriteBasicType(os, binary, avg_samples_);
- WriteToken(os, binary, "<IsGradient>");
- WriteBasicType(os, binary, is_gradient_);
- WriteToken(os, binary, "<LinearParams>");
- linear_params_.Write(os, binary);
- WriteToken(os, binary, "<BiasParams>");
- bias_params_.Write(os, binary);
- WriteToken(os, binary, "<InputPrecision>");
- input_precision_.Write(os, binary);
- WriteToken(os, binary, "</AffinePreconInputComponent>");
-}
-
-void AffinePreconInputComponent::Init(
- BaseFloat learning_rate,
- int32 input_dim, int32 output_dim,
- BaseFloat param_stddev,
- BaseFloat bias_stddev,
- BaseFloat avg_samples) {
- is_gradient_ = false;
- UpdatableComponent::Init(learning_rate);
- linear_params_.Resize(output_dim, input_dim);
- bias_params_.Resize(output_dim);
- KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0);
- linear_params_.SetRandn(); // sets to random normally distributed noise.
- linear_params_.Scale(param_stddev);
- bias_params_.SetRandn();
- bias_params_.Scale(bias_stddev);
- avg_samples_ = avg_samples;
- KALDI_ASSERT(avg_samples_ > 1.0);
- input_precision_.Resize(input_dim);
- input_precision_.Set(1.0); // Set to all ones, as initially we
- // have no idea what the parameter variance is.
-}
-
-void AffinePreconInputComponent::InitFromString(std::string args) {
- std::string orig_args(args);
- bool ok = true;
- BaseFloat learning_rate = learning_rate_,
- avg_samples = 2000.0;
- int32 input_dim = -1, output_dim = -1;
- ParseFromString("learning-rate", &args, &learning_rate); // optional.
- ParseFromString("avg-samples", &args, &avg_samples); // optional.
- ok = ok && ParseFromString("input-dim", &args, &input_dim);
- ok = ok && ParseFromString("output-dim", &args, &output_dim);
- BaseFloat param_stddev = 1.0 / std::sqrt(input_dim),
- bias_stddev = 1.0;
- ParseFromString("param-stddev", &args, ¶m_stddev);
- ParseFromString("bias-stddev", &args, &bias_stddev);
- if (!args.empty())
- KALDI_ERR << "Could not process these elements in initializer: "
- << args;
- if (!ok)
- KALDI_ERR << "Bad initializer " << orig_args;
- Init(learning_rate, input_dim, output_dim,
- param_stddev, bias_stddev, avg_samples);
-}
-
-Component* AffinePreconInputComponent::Copy() const {
- AffinePreconInputComponent *ans = new AffinePreconInputComponent();
- ans->learning_rate_ = learning_rate_;
- ans->avg_samples_ = avg_samples_;
- ans->linear_params_ = linear_params_;
- ans->bias_params_ = bias_params_;
- ans->input_precision_ = input_precision_;
- return ans;
-}
-
-void BlockAffineComponent::SetZero(bool treat_as_gradient) {
- if (treat_as_gradient) {
- SetLearningRate(1.0);
- }
- linear_params_.SetZero();
- bias_params_.SetZero();
-}
-
-void BlockAffineComponent::PerturbParams(BaseFloat stddev) {
- CuMatrix<BaseFloat> temp_linear_params(linear_params_);
- temp_linear_params.SetRandn();
- linear_params_.AddMat(stddev, temp_linear_params);
-
- CuVector<BaseFloat> temp_bias_params(bias_params_);
- temp_bias_params.SetRandn();
- bias_params_.AddVec(stddev, temp_bias_params);
-}
-
-BaseFloat BlockAffineComponent::DotProduct(
- const UpdatableComponent &other_in) const {
- const BlockAffineComponent *other =
- dynamic_cast<const BlockAffineComponent*>(&other_in);
- return TraceMatMat(linear_params_, other->linear_params_, kTrans)
- + VecVec(bias_params_, other->bias_params_);
-}
-
-Component* BlockAffineComponent::Copy() const {
- BlockAffineComponent *ans = new BlockAffineComponent();
- ans->learning_rate_ = learning_rate_;
- ans->linear_params_ = linear_params_;
- ans->bias_params_ = bias_params_;
- ans->num_blocks_ = num_blocks_;
- return ans;
-}
-
-void BlockAffineComponent::Scale(BaseFloat scale) {
- linear_params_.Scale(scale);
- bias_params_.Scale(scale);
-}
-
-void BlockAffineComponent::Add(BaseFloat alpha,
- const UpdatableComponent &other_in) {
- const BlockAffineComponent *other =
- dynamic_cast<const BlockAffineComponent*>(&other_in);
- KALDI_ASSERT(other != NULL);
- linear_params_.AddMat(alpha, other->linear_params_);
- bias_params_.AddVec(alpha, other->bias_params_);
-}
-
-void BlockAffineComponent::Propagate(const CuMatrixBase<BaseFloat> &in,
- int32, // num_chunks
- CuMatrix<BaseFloat> *out) const {
- out->Resize(in.NumRows(), bias_params_.Dim());
+void BlockAffineComponent::Propagate(const CuMatrixBase<BaseFloat> &in,
+ int32, // num_chunks
+ CuMatrix<BaseFloat> *out) const {
+ out->Resize(in.NumRows(), bias_params_.Dim());
// The matrix has a block structure where each matrix has input dim
// (#rows) equal to input_block_dim. The blocks are stored in linear_params_
in_deriv->CopyCols(out_deriv, reorder_);
}
-void MixtureProbComponent::Refresh() {
- KALDI_ASSERT(params_.size() == log_params_.size());
- for (size_t i = 0; i < params_.size(); i++) {
- // Make it so each column of params_ sums to one
- CuVector<BaseFloat> col(params_[i].NumRows());
- for (int32 c = 0; c < params_[i].NumCols(); c++) {
- col.CopyColFromMat(log_params_[i], c);
- col.ApplyExp();
- KALDI_ASSERT(col.Sum() > 0.0);
- col.Scale(1.0 / col.Sum()); // make it sum to one.
- params_[i].CopyColFromVec(col, c);
- }
- }
-}
-
-void MixtureProbComponent::PerturbParams(BaseFloat stddev) {
- for (size_t i = 0; i < log_params_.size(); i++) {
- CuMatrix<BaseFloat> &log_params(log_params_[i]);
- CuMatrix<BaseFloat> rand(log_params.NumRows(), log_params.NumCols());
- rand.SetRandn();
- log_params.AddMat(stddev, rand);
- }
- Refresh();
-}
-
-
-Component* MixtureProbComponent::Copy() const {
- MixtureProbComponent *ans = new MixtureProbComponent();
- ans->learning_rate_ = learning_rate_;
- ans->log_params_ = log_params_;
- ans->params_ = params_;
- ans->input_dim_ = input_dim_;
- ans->output_dim_ = output_dim_;
- return ans;
-}
-
-BaseFloat MixtureProbComponent::DotProduct(
- const UpdatableComponent &other_in) const {
- const MixtureProbComponent *other =
- dynamic_cast<const MixtureProbComponent*>(&other_in);
- BaseFloat ans = 0.0;
- KALDI_ASSERT(log_params_.size() == other->log_params_.size());
-
- for (size_t i = 0; i < params_.size(); i++) {
- const CuMatrix<BaseFloat> &log_params(log_params_[i]),
- &other_log_params(other->log_params_[i]);
- ans += TraceMatMat(log_params, other_log_params, kTrans);
- }
- return ans;
-}
-
-void MixtureProbComponent::Scale(BaseFloat scale) {
- for (size_t i = 0; i < params_.size(); i++) {
- CuMatrix<BaseFloat> &log_params(log_params_[i]);
- log_params.Scale(scale);
- }
- Refresh();
-}
-
-void MixtureProbComponent::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
- const MixtureProbComponent *other =
- dynamic_cast<const MixtureProbComponent*>(&other_in);
- KALDI_ASSERT(other != NULL && other->params_.size() == params_.size());
-
- for (size_t i = 0; i < params_.size(); i++) {
- CuMatrix<BaseFloat> &log_params(log_params_[i]);
- const CuMatrix<BaseFloat> &other_log_params(other->log_params_[i]);
- log_params.AddMat(alpha, other_log_params); // <- This is the key line.
- }
- Refresh();
-}
-
-
-void MixtureProbComponent::Init(BaseFloat learning_rate,
- BaseFloat diag_element,
- const std::vector<int32> &sizes) {
- UpdatableComponent::Init(learning_rate);
- input_dim_ = 0;
- output_dim_ = 0;
- params_.resize(sizes.size());
- log_params_.resize(sizes.size());
- KALDI_ASSERT(diag_element > 0.0 && diag_element < 1.0);
- // Initialize to a block-diagonal matrix consisting of a series of square
- // blocks, with sizes specified in "sizes". Note: each block will typically
- // correspond to a number of clustered states, so this whole thing implements
- // an idea similar to the "state clustered tied mixture" system.
- for (size_t i = 0; i < sizes.size(); i++) {
- KALDI_ASSERT(sizes[i] > 0);
- int32 size = sizes[i];
- params_[i].Resize(size, size);
- input_dim_ += size;
- output_dim_ += size;
- if (size == 1) {
- params_[i](0,0) = 1.0;
- } else {
- BaseFloat off_diag_element = (1.0 - diag_element) / (size - 0.999999);
- params_[i].Set(off_diag_element);
- for (int32 j = 0; j < size; j++)
- params_[i](j, j) = diag_element;
- }
- log_params_[i] = params_[i];
- log_params_[i].ApplyLog(); // From now, log_params_ will be the
- // "primary" parameters, with params_ treated as derived quantities.
- }
-}
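Each block above is initialized so that every column is essentially a probability distribution: diag_element on the diagonal and the remaining mass spread evenly over the other entries (the code divides by size - 0.999999 rather than size - 1, leaving the sums a hair under one). A sketch that builds one block with the exact divisor and checks the column sums:

    #include <cstdio>
    #include <vector>

    // One square block: 'diag' on the diagonal, the leftover probability
    // mass spread evenly off-diagonal. Uses the exact (size - 1) divisor.
    std::vector<std::vector<double> > InitBlock(int size, double diag) {
      double off = (1.0 - diag) / (size - 1);
      std::vector<std::vector<double> > block(
          size, std::vector<double>(size, off));
      for (int j = 0; j < size; j++) block[j][j] = diag;
      return block;
    }

    int main() {
      std::vector<std::vector<double> > b = InitBlock(4, 0.8);
      for (int c = 0; c < 4; c++) {
        double sum = 0.0;
        for (int r = 0; r < 4; r++) sum += b[r][c];
        std::printf("column %d sums to %f\n", c, sum);  // 1.0 each
      }
      return 0;
    }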
-
-// e.g. args="learning-rate=0.01 diag-element=0.9 dims=3:4:5"
-void MixtureProbComponent::InitFromString(std::string args) {
- std::string orig_args(args);
- bool ok = true;
- BaseFloat learning_rate = learning_rate_,
- diag_element = 0.9;
- std::vector<int32> dims;
- ParseFromString("learning-rate", &args, &learning_rate); // optional.
- ParseFromString("diag-element", &args, &diag_element); // optional.
- ok = ok && ParseFromString("dims", &args, &dims); // dims is colon-separated list.
- if (!args.empty())
- KALDI_ERR << "Could not process these elements in initializer: "
- << args;
- if (!ok)
- KALDI_ERR << "Bad initializer " << orig_args;
- Init(learning_rate, diag_element, dims);
-}
-
-// For back-compatibility, we read and write the "params".
-void MixtureProbComponent::Read(std::istream &is, bool binary) {
- ExpectOneOrTwoTokens(is, binary, "<MixtureProbComponent>", "<LearningRate>");
- ReadBasicType(is, binary, &learning_rate_);
- ExpectToken(is, binary, "<Params>");
- int32 size;
- ReadBasicType(is, binary, &size);
- input_dim_ = 0;
- output_dim_ = 0;
- KALDI_ASSERT(size >= 0);
- params_.resize(size);
- log_params_.resize(size);
- for (int32 i = 0; i < size; i++) {
- params_[i].Read(is, binary);
- input_dim_ += params_[i].NumCols();
- output_dim_ += params_[i].NumRows();
- log_params_[i] = params_[i];
- log_params_[i].ApplyLog();
- }
-
-#if 0 // this is back-compatibility code, now disabled. Will remove eventually.
- std::string token;
- ReadToken(is, binary, &token);
- if (token == "<IsGradient>") { // Back-compatibility code,
- // remove this later.
- bool tmp;
- ReadBasicType(is, binary, &tmp);
- ExpectToken(is, binary, "</MixtureProbComponent>");
- } else {
- KALDI_ASSERT(token == "</MixtureProbComponent>");
- }
-#else
- ExpectToken(is, binary, "</MixtureProbComponent>");
-#endif
-
-}
-
-void MixtureProbComponent::Write(std::ostream &os, bool binary) const {
- WriteToken(os, binary, "<MixtureProbComponent>");
- WriteToken(os, binary, "<LearningRate>");
- WriteBasicType(os, binary, learning_rate_);
- WriteToken(os, binary, "<Params>");
- int32 size = params_.size();
- WriteBasicType(os, binary, size);
- for (int32 i = 0; i < size; i++)
- params_[i].Write(os, binary);
- WriteToken(os, binary, "</MixtureProbComponent>");
-}
-
-void MixtureProbComponent::SetZero(bool treat_as_gradient) {
- if (treat_as_gradient) {
- SetLearningRate(1.0);
- }
- for (size_t i = 0; i < params_.size(); i++)
- log_params_[i].SetZero();
- Refresh();
-}
-
-void MixtureProbComponent::Propagate(const CuMatrixBase<BaseFloat> &in,
- int32, // num_chunks
- CuMatrix<BaseFloat> *out) const {
- KALDI_ASSERT(in.NumCols() == InputDim());
- out->Resize(in.NumRows(), OutputDim());
-
- int32 num_frames = in.NumRows(),
- input_offset = 0,
- output_offset = 0;
-
- for (size_t i = 0; i < params_.size(); i++) {
- int32 this_input_dim = params_[i].NumCols(), // input dim of this block.
- this_output_dim = params_[i].NumRows();
- KALDI_ASSERT(this_input_dim > 0 && this_output_dim > 0);
- CuSubMatrix<BaseFloat> in_block(in, 0, num_frames,
- input_offset, this_input_dim),
- out_block(*out, 0, num_frames, output_offset, this_output_dim);
- const CuMatrix<BaseFloat> &param_block(params_[i]);
- out_block.AddMatMat(1.0, in_block, kNoTrans, param_block, kTrans, 0.0);
- input_offset += this_input_dim;
- output_offset += this_output_dim;
- }
- KALDI_ASSERT(input_offset == InputDim() && output_offset == OutputDim());
-}
-
-void MixtureProbComponent::Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &,// out_value
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32, // num_chunks
- Component *to_update_in,
- CuMatrix<BaseFloat> *in_deriv) const {
- MixtureProbComponent *to_update = dynamic_cast<MixtureProbComponent*>(
- to_update_in);
-
- in_deriv->Resize(out_deriv.NumRows(), InputDim());
- KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() &&
- in_value.NumCols() == InputDim() && out_deriv.NumCols() == OutputDim());
- int32 num_frames = in_value.NumRows(),
- input_offset = 0,
- output_offset = 0;
-
- for (size_t i = 0; i < params_.size(); i++) {
- int32 this_input_dim = params_[i].NumCols(), // input dim of this block.
- this_output_dim = params_[i].NumRows();
- KALDI_ASSERT(this_input_dim > 0 && this_output_dim > 0);
- CuSubMatrix<BaseFloat> in_value_block(in_value, 0, num_frames,
- input_offset, this_input_dim),
- in_deriv_block(*in_deriv, 0, num_frames,
- input_offset, this_input_dim),
- out_deriv_block(out_deriv, 0, num_frames,
- output_offset, this_output_dim);
- const CuMatrix<BaseFloat> &param_block(params_[i]);
-
- // Propagate gradient back to in_deriv.
- in_deriv_block.AddMatMat(1.0, out_deriv_block, kNoTrans, param_block,
- kNoTrans, 0.0);
-
- if (to_update != NULL) {
- CuMatrix<BaseFloat> &log_param_block_to_update(to_update->log_params_[i]);
- const CuMatrix<BaseFloat> &param_block(this->params_[i]);
-
- int32 num_rows = this_output_dim, num_cols = this_input_dim;
-
- CuMatrix<BaseFloat> gradient(num_rows, num_cols); // gradient
- // in space of derived params "params_".
- gradient.AddMatMat(1.0, out_deriv_block,
- kTrans, in_value_block, kNoTrans,
- 0.0);
-
- CuVector<BaseFloat> param_col(num_rows),
- gradient_col(num_rows),
- log_gradient_col(num_rows),
- log_param_col(num_rows);
- for (int32 col = 0; col < num_cols; col++) {
- param_col.CopyColFromMat(param_block, col);
- gradient_col.CopyColFromMat(gradient, col);
- BaseFloat cT_g = VecVec(param_col, gradient_col);
-
- log_gradient_col.AddVecVec(1.0, param_col, gradient_col, 0.0); // h <-- diag(c) g.
- log_gradient_col.AddVec(-cT_g, param_col); // h -= (c^T g) c . This is the
- // effect on the derivative of the sum-to-one constraint.
- log_param_col.CopyColFromMat(log_param_block_to_update, col);
- log_param_col.AddVec(to_update->learning_rate_,
- log_gradient_col);
- // Gradient step in unnormalized log-prob space.
- log_param_block_to_update.CopyColFromVec(log_param_col, col); // Write back.
- }
- }
- input_offset += this_input_dim;
- output_offset += this_output_dim;
- }
- if (to_update != NULL)
- to_update->Refresh();
- KALDI_ASSERT(input_offset == InputDim() && output_offset == OutputDim());
-}
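The per-column update above is the softmax Jacobian: with c the normalized column and g the gradient with respect to c, the gradient with respect to the unnormalized log-parameters is h = diag(c)g - (c^T g)c. Because c sums to one, h sums to zero, so the step respects the sum-to-one constraint to first order. A standalone check of that identity (plain C++, not the CuVector code):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // h = diag(c) g - (c^T g) c : the gradient pushed through the
    // column normalization performed in Refresh().
    std::vector<double> LogSpaceGradient(const std::vector<double> &c,
                                         const std::vector<double> &g) {
      double cTg = 0.0;
      for (size_t i = 0; i < c.size(); i++) cTg += c[i] * g[i];
      std::vector<double> h(c.size());
      for (size_t i = 0; i < c.size(); i++) h[i] = c[i] * (g[i] - cTg);
      return h;
    }

    int main() {
      std::vector<double> c = {0.7, 0.2, 0.1}, g = {1.0, -2.0, 0.5};
      std::vector<double> h = LogSpaceGradient(c, g);
      double sum = 0.0;
      for (size_t i = 0; i < h.size(); i++) sum += h[i];
      assert(std::fabs(sum) < 1e-12);  // zero because c sums to one
      return 0;
    }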
-
-int32 MixtureProbComponent::GetParameterDim() const {
- int32 ans = 0;
- for (size_t i = 0; i < params_.size(); i++)
- ans += params_[i].NumRows() * params_[i].NumCols();
- return ans;
-}
-
-void MixtureProbComponent::Vectorize(VectorBase<BaseFloat> *params) const {
- int32 offset = 0;
- for (size_t i = 0; i < params_.size(); i++) {
- int32 size = params_[i].NumRows() * params_[i].NumCols();
- params->Range(offset, size).CopyRowsFromMat(params_[i]);
- offset += size;
- }
- KALDI_ASSERT(offset == params->Dim());
-}
-
-void MixtureProbComponent::UnVectorize(const VectorBase<BaseFloat> &params) {
- int32 offset = 0;
- for (size_t i = 0; i < params_.size(); i++) {
- int32 size = params_[i].NumRows() * params_[i].NumCols();
- params_[i].CopyRowsFromVec(params.Range(offset, size));
- offset += size;
- }
- KALDI_ASSERT(offset == params.Dim());
-}
-
void SumGroupComponent::Init(const std::vector<int32> &sizes) {
KALDI_ASSERT(!sizes.empty());
std::vector<Int32Pair> cpu_vec(sizes.size());
out->AddMat(stddev_, rand);
}
-void AffineComponentA::Read(std::istream &is, bool binary) {
- ExpectOneOrTwoTokens(is, binary, "<AffineComponentA>", "<LearningRate>");
- ReadBasicType(is, binary, &learning_rate_);
- ExpectToken(is, binary, "<LinearParams>");
- linear_params_.Read(is, binary);
- ExpectToken(is, binary, "<BiasParams>");
- bias_params_.Read(is, binary);
- ExpectToken(is, binary, "<IsGradient>");
- ReadBasicType(is, binary, &is_gradient_);
- ExpectToken(is, binary, "<InputScatter>");
- input_scatter_.Read(is, binary);
- ExpectToken(is, binary, "<OutputScatter>");
- output_scatter_.Read(is, binary);
- ExpectToken(is, binary, "<InC>");
- in_C_.Read(is, binary);
- ExpectToken(is, binary, "<InCInv>");
- in_C_inv_.Read(is, binary);
- ExpectToken(is, binary, "<OutC>");
- out_C_.Read(is, binary);
- ExpectToken(is, binary, "<OutCInv>");
- out_C_inv_.Read(is, binary);
- ExpectToken(is, binary, "<InvFisherIn>");
- inv_fisher_in_.Read(is, binary);
- ExpectToken(is, binary, "<InvFisherOut>");
- inv_fisher_out_.Read(is, binary);
- ExpectToken(is, binary, "</AffineComponentA>");
-}
-
-void AffineComponentA::Write(std::ostream &os, bool binary) const {
- WriteToken(os, binary, "<AffineComponentA>");
- WriteToken(os, binary, "<LearningRate>");
- WriteBasicType(os, binary, learning_rate_);
- WriteToken(os, binary, "<LinearParams>");
- linear_params_.Write(os, binary);
- WriteToken(os, binary, "<BiasParams>");
- bias_params_.Write(os, binary);
- WriteToken(os, binary, "<IsGradient>");
- WriteBasicType(os, binary, is_gradient_);
- WriteToken(os, binary, "<InputScatter>");
- input_scatter_.Write(os, binary);
- WriteToken(os, binary, "<OutputScatter>");
- output_scatter_.Write(os, binary);
- WriteToken(os, binary, "<InC>");
- in_C_.Write(os, binary);
- WriteToken(os, binary, "<InCInv>");
- in_C_inv_.Write(os, binary);
- WriteToken(os, binary, "<OutC>");
- out_C_.Write(os, binary);
- WriteToken(os, binary, "<OutCInv>");
- out_C_inv_.Write(os, binary);
- WriteToken(os, binary, "<InvFisherIn>");
- inv_fisher_in_.Write(os, binary);
- WriteToken(os, binary, "<InvFisherOut>");
- inv_fisher_out_.Write(os, binary);
- WriteToken(os, binary, "</AffineComponentA>");
-}
-
-AffineComponentA::AffineComponentA(const AffineComponent &component):
- AffineComponent(component) { }
-
-
-void AffineComponentA::InitializeScatter() {
- KALDI_ASSERT(is_gradient_ &&
- "InitializeScatter should only be called on gradients.");
- KALDI_ASSERT(input_scatter_.NumRows() == 0 &&
- output_scatter_.NumRows() == 0 &&
- "InitializeScatter called when already initialized.");
- input_scatter_.Resize(InputDim() + 1); // + 1 because of the bias; we include
- // that in the input dimension.
- output_scatter_.Resize(OutputDim());
-}
-
-void AffineComponentA::Scale(BaseFloat scale) {
- linear_params_.Scale(scale);
- bias_params_.Scale(scale);
- input_scatter_.Scale(scale);
- output_scatter_.Scale(scale);
- // Remove all precomputed quantities, they'll be invalid.
- ClearPrecomputedQuantities();
-}
-
-void AffineComponentA::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
- const AffineComponentA *other =
- dynamic_cast<const AffineComponentA*>(&other_in);
- KALDI_ASSERT(other != NULL);
- linear_params_.AddMat(alpha, other->linear_params_);
- bias_params_.AddVec(alpha, other->bias_params_);
- input_scatter_.AddSp(alpha, other->input_scatter_);
- output_scatter_.AddSp(alpha, other->output_scatter_);
- // Remove all precomputed quantities, they'll be invalid.
- ClearPrecomputedQuantities();
-}
-
-Component* AffineComponentA::Copy() const {
- // The initializer below will be the one that takes AffineComponent,
- // so we need to take care of the remaining parameters.
- AffineComponentA *ans = new AffineComponentA(*this);
- ans->input_scatter_ = input_scatter_;
- ans->output_scatter_ = output_scatter_;
- return ans;
-}
-
-void AffineComponentA::ClearPrecomputedQuantities() {
- in_C_.Resize(0);
- in_C_inv_.Resize(0);
- out_C_.Resize(0);
- out_C_inv_.Resize(0);
- inv_fisher_in_.Resize(0);
- inv_fisher_out_.Resize(0);
-}
-
-void AffineComponentA::UpdateSimple(
- const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_deriv) {
- KALDI_ASSERT(this->is_gradient_);
- bias_params_.AddRowSumMat(learning_rate_, out_deriv, 1.0);
- linear_params_.AddMatMat(learning_rate_, out_deriv, kTrans,
- in_value, kNoTrans, 1.0);
-
- // The rest of this function is about updating the scatters.
- if (input_scatter_.NumRows() != 0) { // scatter is to be accumulated..
- CuMatrix<double> in_value_dbl(in_value.NumCols() + 1,
- in_value.NumRows());
- in_value_dbl.Range(0, in_value.NumCols(),
- 0, in_value.NumRows()).CopyFromMat(in_value, kTrans);
- in_value_dbl.Row(in_value.NumCols()).Set(1.0);
- input_scatter_.AddMat2(1.0, in_value_dbl, kNoTrans, 1.0);
- }
- if (output_scatter_.NumRows() != 0) {
- CuMatrix<double> out_deriv_dbl(out_deriv, kTrans);
- output_scatter_.AddMat2(1.0, out_deriv_dbl, kNoTrans, 1.0);
- }
-}
-
-// static
-void AffineComponentA::ComputeTransforms(const CuSpMatrix<double> &scatter_in,
- const PreconditionConfig &config,
- double tot_count,
- CuTpMatrix<double> *C,
- CuTpMatrix<double> *C_inv) {
- CuSpMatrix<double> scatter(scatter_in);
- KALDI_ASSERT(scatter.Trace() > 0);
-
- scatter.Scale(1.0 / tot_count);
- // Smooth using "alpha"-- smoothing with the unit matrix.
-
- double d = config.alpha * scatter.Trace() / scatter.NumRows();
- for (int32 i = 0; i < scatter.NumRows(); i++)
- scatter(i, i) += d;
-
- C->Resize(scatter.NumRows());
- C->Cholesky(scatter);
- *C_inv = *C;
- C_inv->Invert();
-}
-/*
- // "transform" is now the cholesky factor C.
-
- // Now, the scatter may be viewed as a scatter of gradients (not parameters),
- // so call it S = \sum g g^T. [we omit the index on g.] We now have S = C
- // C^T. the transformed g would be g' = C^{-1} g. [this would make S' unit.]
- // If renormalize == true, we want to ensure that trace(C^{-1} S C^{-T}) equals
- // trace(S). This is a way of ensuring that the magnitude of the gradients is
- // about the same after transformation. Now, trace(C^{-1} S C^{-T}) is trace(I) =
- // dim(S). So to renormalize to make it equal to trace(S), we'd have to scale
- // by trace(S)/dim(S), which is equivalent to scaling C itself by
- // [trace(S)/dim(S)]^{-0.5}. Note: this assumes that alpha is small.
- // We may have to revisit this later
-
- if (config.renormalize)
- transform->Scale(pow(scatter.Trace() / scatter.NumRows(), -0.5));
-
- // Now take care of whether it should be inverted or not, and
- // transposed or not.
- if (is_gradient) {
- if (forward) transform->Invert(); // g' <-- C^{-1} g
- // else: g <-- C g'
- *trans = kNoTrans;
- } else {
- if (!forward) transform->Invert(); // p <-- C^{-T} p'
- // else: p' <-- C^T p
- *trans = kTrans;
- }
-}
-*/
-
-// static
-void AffineComponentA::ComputePreconditioner(const CuSpMatrix<double> &scatter_in,
- const PreconditionConfig &config,
- double tot_count,
- CuSpMatrix<double> *inv_fisher) {
- CuSpMatrix<double> scatter(scatter_in);
- KALDI_ASSERT(scatter.Trace() > 0);
-
- scatter.Scale(1.0 / tot_count);
- // Smooth using "alpha"-- smoothing with the unit matrix.
-
- double d = config.alpha * scatter.Trace() / scatter.NumRows();
- for (int32 i = 0; i < scatter.NumRows(); i++)
- scatter(i, i) += d;
-
- inv_fisher->Resize(scatter.NumRows());
- inv_fisher->CopyFromSp(scatter);
- inv_fisher->Invert();
-
- if (config.renormalize) {
- // renormalize so trace(inv_fisher . scatter) equals
- // trace(scatter . unit-matrix).
- inv_fisher->Scale(scatter.Trace() / TraceSpSp(*inv_fisher, scatter));
- }
-}
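On a diagonal scatter, ComputePreconditioner() reduces to scalar arithmetic: divide by the count, add alpha*(trace/dim) to each diagonal element, invert, and optionally rescale so that trace(inv_fisher * scatter) matches trace(scatter). A diagonal-only sketch of those steps (illustrative, no Kaldi types):

    #include <vector>

    // Diagonal-case analogue of ComputePreconditioner(), with the
    // renormalization always applied.
    std::vector<double> DiagPreconditioner(std::vector<double> scatter,
                                           double tot_count, double alpha) {
      int dim = scatter.size();
      double trace = 0.0;
      for (int i = 0; i < dim; i++) {
        scatter[i] /= tot_count;
        trace += scatter[i];
      }
      double d = alpha * trace / dim;          // the smoothing term
      std::vector<double> inv_fisher(dim);
      double trace_fs = 0.0;                   // trace(F^{-1} S)
      for (int i = 0; i < dim; i++) {
        inv_fisher[i] = 1.0 / (scatter[i] + d);
        trace_fs += inv_fisher[i] * scatter[i];
      }
      for (int i = 0; i < dim; i++)
        inv_fisher[i] *= trace / trace_fs;     // the renormalize step
      return inv_fisher;
    }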
-
-
-void AffineComponentA::Transform(
- const PreconditionConfig &config,
- bool forward,
- AffineComponent *component) {
- if (!config.do_precondition) return; // There is nothing to do in this case.
- // (this option will probably only be used for testing.)
-
- KALDI_ASSERT(component != NULL);
-
- if (in_C_.NumRows() == 0) { // Need to pre-compute some things.
- double tot_count = input_scatter_(InputDim(), InputDim());
- // This equals the total count, because for each frame the last
- // element of the extended input vector is 1.
- ComputeTransforms(input_scatter_, config, tot_count, &in_C_, &in_C_inv_);
- ComputeTransforms(output_scatter_, config, tot_count, &out_C_, &out_C_inv_);
- }
-
- // "invert" is true if these two bools have the same value.
- bool is_gradient = component->is_gradient_,
- invert = (is_gradient == forward);
-
- // "params" are the parameters of "component" that we'll be changing.
- // Get them as a single matrix.
- CuMatrix<double> params(OutputDim(), InputDim() + 1);
- params.Range(0, OutputDim(), 0, InputDim()).CopyFromMat(
- component->linear_params_);
- params.CopyColFromVec(CuVector<double>(component->bias_params_),
- InputDim());
-
-
- MatrixTransposeType transpose_in = (is_gradient ? kTrans : kNoTrans);
-
- CuMatrix<double> params_temp(OutputDim(), InputDim() + 1);
- params_temp.AddMatTp(1.0, params, kNoTrans,
- invert ? in_C_inv_ : in_C_,
- transpose_in, 0.0);
-
- MatrixTransposeType transpose_out = (is_gradient ? kNoTrans : kTrans);
- params.AddTpMat(1.0, invert ? out_C_inv_ : out_C_, transpose_out,
- params_temp, kNoTrans, 0.0);
-
- // OK, we're done transforming the parameters or gradients.
-
- // Copy the "params" back to "component".
- component->linear_params_.CopyFromMat(
- params.Range(0, OutputDim(), 0, InputDim()));
- component->bias_params_.CopyColFromMat(params,
- InputDim());
-}
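Reading the AddMatTp/AddTpMat calls off the code above (G a gradient, P a parameter matrix, C_in/C_out the cached Cholesky factors), the four cases are:

  gradients:   forward: G' = C_out^{-1} G C_in^{-T};   backward: G = C_out G' C_in^T
  parameters:  forward: P' = C_out^T P C_in;           backward: P = C_out^{-T} P' C_in^{-1}

These pairings preserve trace(G'^T P') = trace(G^T P), i.e. the DotProduct between a gradient and the parameters is invariant under the transform, which is why gradients and parameters transform differently.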
-
-
-void AffineComponentA::Precondition(
- const PreconditionConfig &config,
- AffineComponent *component) {
-
- if (!config.do_precondition) return; // There is nothing to do in this case.
- // (this option will probably only be used for testing.)
-
- KALDI_ASSERT(component != NULL);
-
- if (inv_fisher_in_.NumRows() == 0) { // Need to pre-compute some things.
- double tot_count = input_scatter_(InputDim(), InputDim());
- // This equals the total count, because for each frame the last
- // element of the extended input vector is 1.
- ComputePreconditioner(input_scatter_, config, tot_count, &inv_fisher_in_);
- ComputePreconditioner(output_scatter_, config, tot_count, &inv_fisher_out_);
- }
-
- // "params" are the parameters of "component" that we'll be changing.
- // Get them as a single matrix.
- CuMatrix<double> params(OutputDim(), InputDim() + 1);
- params.Range(0, OutputDim(), 0, InputDim()).CopyFromMat(
- component->linear_params_);
- params.CopyColFromVec(CuVector<double>(component->bias_params_),
- InputDim());
-
- CuMatrix<double> params_temp(OutputDim(), InputDim() + 1);
- params_temp.AddMatSp(1.0, params, kNoTrans, inv_fisher_in_, 0.0);
-
- params.AddSpMat(1.0, inv_fisher_out_, params_temp, kNoTrans, 0.0);
-
- // OK, we're done transforming the parameters or gradients.
- // Copy the "params" back to "component".
- component->linear_params_.CopyFromMat(
- params.Range(0, OutputDim(), 0, InputDim()));
- component->bias_params_.CopyColFromMat(params,
- InputDim());
-}
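In matrix form, the two multiplies above compute

  P <-- F_out^{-1} P F_in^{-1},

equivalently vec(P) is multiplied by F_in^{-1} (Kronecker-product) F_out^{-1}: a Kronecker-factored stand-in for multiplying by the inverse Fisher, as the class documentation below describes.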
-
} // namespace nnet2
} // namespace kaldi
index f5bab2ba94dfc999cd5fcaa4bd18cb0e7783099e..f1eea9125b821000771f2a875bc08c2560048810 100644 (file)
const UpdatableComponent &operator = (const UpdatableComponent &other); // Disallow.
};
-/// Augments a scalar variable with powers of itself, e.g. x => {x, x^2}.
-class PowerExpandComponent: public Component {
- public:
- void Init(int32 dim, int32 max_power = 2, BaseFloat higher_power_scale = 1.0);
-
- explicit PowerExpandComponent(int32 dim, int32 max_power = 2,
- BaseFloat higher_power_scale = 1.0) {
- Init(dim, max_power, higher_power_scale);
- }
- PowerExpandComponent(): input_dim_(0), max_power_(2),
- higher_power_scale_(1.0) { }
- virtual std::string Type() const { return "PowerExpandComponent"; }
- virtual void InitFromString(std::string args);
- virtual int32 InputDim() const { return input_dim_; }
- virtual int32 OutputDim() const { return max_power_ * input_dim_; }
- virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
- int32 num_chunks,
- CuMatrix<BaseFloat> *out) const;
- virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &, // out_value
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32 num_chunks,
- Component *to_update, // may be identical to "this".
- CuMatrix<BaseFloat> *in_deriv) const;
- virtual bool BackpropNeedsInput() const { return true; }
- virtual bool BackpropNeedsOutput() const { return false; }
- virtual Component* Copy() const { return new PowerExpandComponent(input_dim_,
- max_power_,
- higher_power_scale_); }
-
- virtual void Read(std::istream &is, bool binary); // This Read function
- // requires that the Component has the correct type.
-
- /// Write component to stream
- virtual void Write(std::ostream &os, bool binary) const;
-
- virtual std::string Info() const;
- private:
- int32 input_dim_;
- int32 max_power_;
- BaseFloat higher_power_scale_; // Scale put on all powers
- // except the first one.
-};
-
-
/// This kind of Component is a base-class for things like
/// sigmoid and softmax.
class NonlinearComponent: public Component {
AffineComponent *c3);
protected:
friend class AffineComponentPreconditionedOnline;
- friend class AffineComponentA;
// This function Update() is for extensibility; child classes may override this.
virtual void Update(
const CuMatrixBase<BaseFloat> &in_value,
};
-/// PiecewiseLinearComponent is a kind of trainable version of the
-/// RectifiedLinearComponent, in which each dimension of the nonlinearity has a
-/// number of parameters that can be trained. It's of the form
-/// alpha + beta x + gamma_1 |x - c_1| + gamma_2 |x - c_2| + ... + gamma_N |x - c_N|
-/// where c_1 ... c_N are constants (by default, equally
-/// spaced between -1 and 1), and the alpha, beta and gamma quantities are trainable.
-/// (Each dimension has separate alpha, beta and gamma quantities).
-/// We require that N be odd so that the "middle" gamma quantity corresponds
-/// to c = 0; this is for convenience of initialization, so that the component
-/// can be initialized to a ReLU.
-class PiecewiseLinearComponent: public UpdatableComponent {
- public:
- explicit PiecewiseLinearComponent(const PiecewiseLinearComponent &other);
- virtual int32 InputDim() const { return params_.NumRows(); }
- virtual int32 OutputDim() const { return params_.NumRows(); }
-
- void Init(int32 dim, int32 N,
- BaseFloat learning_rate,
- BaseFloat max_change);
-
- virtual std::string Info() const;
-
- virtual void InitFromString(std::string args);
-
- PiecewiseLinearComponent(): is_gradient_(false), max_change_(0.0) { } // use Init to really initialize.
-
- virtual std::string Type() const { return "PiecewiseLinearComponent"; }
- virtual bool BackpropNeedsInput() const { return true; }
- virtual bool BackpropNeedsOutput() const { return false; }
- virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
- int32 num_chunks,
- CuMatrix<BaseFloat> *out) const;
- virtual void Scale(BaseFloat scale);
- virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
- virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_value, // dummy
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32 num_chunks,
- Component *to_update, // may be identical to "this".
- CuMatrix<BaseFloat> *in_deriv) const;
- virtual void SetZero(bool treat_as_gradient);
- virtual void Read(std::istream &is, bool binary);
- virtual void Write(std::ostream &os, bool binary) const;
- virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
- virtual Component* Copy() const;
- virtual void PerturbParams(BaseFloat stddev);
-
- const CuMatrix<BaseFloat> &Params() { return params_; }
-
- virtual int32 GetParameterDim() const;
-
- virtual void Vectorize(VectorBase<BaseFloat> *params) const;
- virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
-
- protected:
- const PiecewiseLinearComponent &operator = (const PiecewiseLinearComponent &other); // Disallow.
- CuMatrix<BaseFloat> params_;
-
- bool is_gradient_; // If true, treat this as just a gradient.
- BaseFloat max_change_; // If nonzero, maximum change allowed per individual
- // parameter per minibatch.
-};
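As a reference for the formula in the doc-comment above, here is a standalone sketch in plain C++ (an illustration, not the removed implementation). With alpha = 0 and beta = gamma_mid = 0.5, beta*x + 0.5*|x| = max(x, 0), so the ReLU-like initialization the comment mentions falls out directly.

#include <cmath>
#include <cstdio>
#include <vector>

// y = alpha + beta*x + sum_i gamma_i * |x - c_i|, with the c_i equally
// spaced on [-1, 1] and N odd so that the middle breakpoint is c = 0.
double PiecewiseLinear(double x, double alpha, double beta,
                       const std::vector<double> &gamma,
                       const std::vector<double> &c) {
  double y = alpha + beta * x;
  for (size_t i = 0; i < c.size(); i++)
    y += gamma[i] * std::fabs(x - c[i]);
  return y;
}

int main() {
  const int N = 5;
  std::vector<double> c(N), gamma(N, 0.0);
  for (int i = 0; i < N; i++) c[i] = -1.0 + 2.0 * i / (N - 1);
  gamma[N / 2] = 0.5;  // only the middle gamma (at c == 0) is nonzero.
  for (double x = -2.0; x <= 2.0; x += 0.5)  // prints max(x, 0)
    printf("f(%g) = %g\n", x, PiecewiseLinear(x, /*alpha=*/0.0, /*beta=*/0.5,
                                              gamma, c));
  return 0;
}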
-
-
// This is an idea Dan is trying out, a little bit like
// preconditioning the update with the Fisher matrix, but the
// Fisher matrix has a special structure.
const CuMatrixBase<BaseFloat> &out_deriv);
};
-
-/// AffineComponentModified is the same as AffineComponent, but we are careful
-/// about the lengths of rows of the parameter matrix when we do the update.
-/// That means, for a given row, we first do an update along the direction of
-/// the existing vector; we then take the update orthogonal to that direction,
-/// but keep the length of the vector fixed.
-class AffineComponentModified: public AffineComponent {
- public:
- virtual std::string Type() const { return "AffineComponentModified"; }
-
- virtual void Read(std::istream &is, bool binary);
- virtual void Write(std::ostream &os, bool binary) const;
- void Init(BaseFloat learning_rate,
- int32 input_dim, int32 output_dim,
- BaseFloat param_stddev, BaseFloat bias_stddev,
- BaseFloat cutoff_length, BaseFloat max_change);
- void Init(BaseFloat learning_rate, BaseFloat cutoff_length,
- BaseFloat max_change, std::string matrix_filename);
-
- virtual void InitFromString(std::string args);
- virtual std::string Info() const;
- virtual Component* Copy() const;
- AffineComponentModified(): cutoff_length_(10.0), max_change_(0.1) { }
-
- private:
- KALDI_DISALLOW_COPY_AND_ASSIGN(AffineComponentModified);
-
- BaseFloat cutoff_length_; /// If the length of the vector corresponding to
- /// this row of the parameter matrix is less than this, we just do a regular
- /// gradient descent update. This would typically be less than
- /// sqrt(InputDim())-- a value smaller than the expected length of the
- /// parameter vector.
-
- BaseFloat max_change_; /// [if above the cutoff], this is the maximum
- /// change allowed in the vector per minibatch,
- /// as a proportion of the previous value. We separately
- /// apply this constraint to both the length and direction. Should
- /// be less than one, e.g. 0.1 or 0.01.
-
- virtual void Update(
- const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_deriv);
-};
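One plausible reading of that update rule, as a hedged standalone sketch (an illustration of the idea, not the removed Update() code): split the step for a row w into a part along w and a part orthogonal to w, cap each relative change at max_change, and hold the length fixed during the direction step.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void UpdateRow(std::vector<double> *w_ptr, const std::vector<double> &delta,
               double cutoff_length, double max_change) {
  std::vector<double> &w = *w_ptr;
  const size_t d = w.size();
  double ww = 0.0, wd = 0.0;
  for (size_t i = 0; i < d; i++) { ww += w[i] * w[i]; wd += w[i] * delta[i]; }
  double len = std::sqrt(ww);
  if (len < cutoff_length) {  // short row: plain gradient step.
    for (size_t i = 0; i < d; i++) w[i] += delta[i];
    return;
  }
  // Length update: relative change along w, capped at max_change.
  double rel = std::max(-max_change, std::min(max_change, wd / ww));
  double new_len = len * (1.0 + rel);
  // Direction update: orthogonal part of delta, capped at max_change * len.
  std::vector<double> orth(d);
  double orth2 = 0.0;
  for (size_t i = 0; i < d; i++) {
    orth[i] = delta[i] - (wd / ww) * w[i];
    orth2 += orth[i] * orth[i];
  }
  double s = std::sqrt(orth2) > max_change * len
                 ? max_change * len / std::sqrt(orth2) : 1.0;
  double norm2 = 0.0;
  for (size_t i = 0; i < d; i++) {
    w[i] += s * orth[i];
    norm2 += w[i] * w[i];
  }
  // Renormalize so the direction step leaves the (capped) length unchanged.
  double rescale = new_len / std::sqrt(norm2);
  for (size_t i = 0; i < d; i++) w[i] *= rescale;
}

int main() {
  std::vector<double> w = {3.0, 4.0};       // length 5
  std::vector<double> delta = {1.0, -1.0};  // hypothetical gradient step
  UpdateRow(&w, delta, /*cutoff_length=*/1.0, /*max_change=*/0.1);
  printf("w = (%g, %g), length %g\n", w[0], w[1],
         std::sqrt(w[0] * w[0] + w[1] * w[1]));
  return 0;
}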
-
-
class RandomComponent: public Component {
public:
// This function is required in testing code and in other places we need
CuRand<BaseFloat> random_generator_;
};
-
-
-struct PreconditionConfig { // relates to AffineComponentA
- BaseFloat alpha;
- bool do_precondition;
- bool renormalize;
-
- PreconditionConfig(): alpha(0.1), do_precondition(true),
- renormalize(true) { }
- void Register(OptionsItf *po) {
- po->Register("alpha", &alpha, "Smoothing constant used in "
- "preconditioning of updates.");
- po->Register("do-precondition", &do_precondition, "Controls whether "
- "or not preconditioning is applied in the L-BFGS update.");
- po->Register("renormalize", &renormalize, "If true, in the preconditioning "
- "we renormalize with a scalar so the projected scatter has the "
- "same trace as before preconditioning.");
- }
-};
-
-
-/**
- AffineComponentA is a special type of AffineComponent that
- stores matrices for preconditioning similar to those used
- in the update function of AffineComponentPreconditioned. This is
- intended for use as a preconditioner in L-BFGS updates.
- In this case we optionally store the preconditioning
- information with the gradient information, in a separate
- copy of the component.
-*/
-class AffineComponentA: public AffineComponent {
- public:
- AffineComponentA() { }
-
- virtual std::string Type() const { return "AffineComponentA"; }
-
- virtual void Read(std::istream &is, bool binary);
- virtual void Write(std::ostream &os, bool binary) const;
-
- // There is no Init function for now; we only have the
- // ability to initialize from another AffineComponent (or child
- // class). This is because we imagine that the L-BFGS training
- // will be initialized from a system trained with SGD, for which
- // something like AffineComponentPreconditioned will be more
- // appropriate; we'll then convert the model.
- AffineComponentA(const AffineComponent &component);
-
- // We're not supporting initializing as this type.
- virtual void InitFromString(std::string args) { KALDI_ASSERT(0); }
- virtual Component* Copy() const;
-
- virtual void Scale(BaseFloat scale);
- virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
-
-
- // Some functions that are specific to this class:
- void InitializeScatter(); // Lets the class
- // know that it should accumulate the scatter matrix; sets
- // up input_scatter_ and output_scatter_.
-
-
- // This function uses the input_scatter_ and output_scatter_ variables of the
- // current class to transform the linear_params_ and bias_params_ variables of
- // "component". If forward == true then we transform to the preconditioned
- // space; otherwise we transform back from the preconditioned to the canonical
- // space. This is done differently depending on whether component->is_gradient_
- // is true, because gradients and parameters transform differently. The alpha
- // value relates to smoothing with the unit matrix; it's not defined in quite
- // the same way as for AffineComponentPreconditioned. See the code for
- // details.
- void Transform(const PreconditionConfig &config,
- bool forward,
- AffineComponent *component);
-
- // This function uses the input_scatter_ and output_scatter_ variables of the
- // current class to transform the linear_params_ and bias_params_ variables of
- // "component". It is equivalent to multiplying by the inverse Fisher,
- // or approximate inverse Hessian. It's the operation that you need
- // in optimization methods like L-BFGS, to transform from "gradient space"
- // into "model space".
- // Note: it's not const in this object, because we may cache stuff with the model.
- // See also the function "PreconditionNnet" in nnet-lbfgs.h, which
- // does this at the whole-neural-net level (by calling this function).
- void Precondition(const PreconditionConfig &config,
- AffineComponent *component);
-
- private:
-
- // The following variables are not used for the actual neural net, but
- // only when is_gradient_ == true (when it's being used to store gradients).
-
- CuSpMatrix<double> input_scatter_; // scatter of (input vectors extended with 1.)
- // This is only set up if this->is_gradient_ == true, and InitializeScatter()
- // has been called.
- CuSpMatrix<double> output_scatter_;
-
- // The following four quantities may be cached by the function "Transform",
- // to avoid duplicating work.
- CuTpMatrix<double> in_C_;
- CuTpMatrix<double> in_C_inv_;
- CuTpMatrix<double> out_C_;
- CuTpMatrix<double> out_C_inv_;
-
- // The following two quantities may be cached by the function "Precondition",
- // to avoid duplicating work.
- CuSpMatrix<double> inv_fisher_in_;
- CuSpMatrix<double> inv_fisher_out_;
-
- // This function computes the matrix (and corresponding transpose-ness) that
- // we'd left-multiply a vector by when transforming the parameter/gradient
- // space.
- static void ComputeTransforms(const CuSpMatrix<double> &scatter,
- const PreconditionConfig &config,
- double tot_count,
- CuTpMatrix<double> *C,
- CuTpMatrix<double> *C_inv);
-
- // This function is called by "Precondition"; it pre-computes
- // certain quantities we'll need.
- static void ComputePreconditioner(const CuSpMatrix<double> &scatter,
- const PreconditionConfig &config,
- double tot_count,
- CuSpMatrix<double> *inv_fisher);
-
- void ClearPrecomputedQuantities();
-
- // The following update function is called when *this is
- // a gradient. We only override this one.
- virtual void UpdateSimple(
- const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_deriv);
-};
-
-
/// Splices a context window of frames together [over time]
class SpliceComponent: public Component {
public:
};
-// Affine means a linear function plus an offset. PreconInput means we
-// precondition using the inverse of the variance of each dimension of the input
-// data. Note that this doesn't take into account any scaling of the samples,
-// but this doesn't really matter. This has some relation to AdaGrad, except
-// it's being done per input dimension rather than per parameter, and also
-// we multiply by a separately supplied and updated learning rate which will
-// typically vary with time. Note: avg_samples is the number of samples over
-// which we average the variance of the input data.
-class AffinePreconInputComponent: public AffineComponent {
- public:
- void Init(BaseFloat learning_rate,
- int32 input_dim, int32 output_dim,
- BaseFloat param_stddev,
- BaseFloat bias_stddev,
- BaseFloat avg_samples);
- virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_value, // dummy
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32 num_chunks,
- Component *to_update, // may be identical to "this".
- CuMatrix<BaseFloat> *in_deriv) const;
- AffinePreconInputComponent() { } // use Init to really initialize.
- virtual std::string Type() const { return "AffinePreconInputComponent"; }
- virtual void InitFromString(std::string args);
- virtual void SetZero(bool treat_as_gradient);
- virtual void Read(std::istream &is, bool binary);
- virtual void Write(std::ostream &os, bool binary) const;
- virtual Component* Copy() const;
- private:
- KALDI_DISALLOW_COPY_AND_ASSIGN(AffinePreconInputComponent);
- BaseFloat avg_samples_; // Config parameter; determines how many samples
- // we average the input feature variance over during training
- bool is_gradient_; // Set this to true if we consider this as a gradient.
- // In this case we don't do the input preconditioning.
-
- // Note: linear_params_ and bias_params_ are inherited from
- // AffineComponent.
- CuVector<BaseFloat> input_precision_; // Inverse variance of input features; used
- // to precondition the update.
-};
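A hedged sketch of the idea in that comment (plain C++, not the removed code; the exact averaging scheme is an assumption): track a running per-dimension variance of the input with time constant avg_samples, and divide each input dimension's gradient by it.

#include <cstdio>
#include <vector>

int main() {
  const int input_dim = 3;
  double avg_samples = 100.0, learning_rate = 0.01;
  std::vector<double> input_var(input_dim, 1.0);   // running variance estimate
  std::vector<double> weight_row(input_dim, 0.0);  // one row of the matrix

  double x[input_dim] = {0.5, -2.0, 8.0};  // one hypothetical input frame
  double g[input_dim] = {0.1, 0.1, 0.1};   // gradient w.r.t. this row

  // Update the running variance with weight 1/avg_samples per sample
  // (assumed exponential-moving-average form).
  for (int i = 0; i < input_dim; i++)
    input_var[i] += (x[i] * x[i] - input_var[i]) / avg_samples;

  // Preconditioned update: scale each dimension by its inverse variance.
  for (int i = 0; i < input_dim; i++)
    weight_row[i] += learning_rate * g[i] / input_var[i];

  for (int i = 0; i < input_dim; i++) printf("%g ", weight_row[i]);
  printf("\n");
  return 0;
}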
-
-
// Affine means a linear function plus an offset. "Block" means
// here that we support a number of equal-sized blocks of parameters,
};
-
-// MixtureProbComponent is a linear transform, but it's kind of a special case.
-// It's used to transform probabilities while retaining the sum-to-one
-// constraint (after the softmax), so we require nonnegative
-// elements that sum to one for each column. In addition, this component
-// implements a linear transformation that's a block matrix... not quite
-// block diagonal, because the component matrices aren't necessarily square.
-// They start off square, but as we mix up, they may get non-square.
-//
-// From its external interface, i.e. DotProduct(), Scale(), and Backprop(), if
-// you use this class in the expected way (e.g. only calling DotProduct()
-// between a gradient and the parameters), it behaves as if the parameters were
-// stored as unnormalized log-prob and the gradients were taken w.r.t. that
-// representation. This is the only way for the Scale() function to make sense.
-// In reality, the parameters are stored as probabilities (normalized to sum to
-// one for each column).
-
-class MixtureProbComponent: public UpdatableComponent {
- public:
- virtual int32 InputDim() const { return input_dim_; }
- virtual int32 OutputDim() const { return output_dim_; }
- void Init(BaseFloat learning_rate,
- BaseFloat diag_element,
- const std::vector<int32> &sizes);
- virtual void InitFromString(std::string args);
- MixtureProbComponent() { }
- virtual void SetZero(bool treat_as_gradient);
- virtual std::string Type() const { return "MixtureProbComponent"; }
- virtual bool BackpropNeedsInput() const { return true; }
- virtual bool BackpropNeedsOutput() const { return false; }
- virtual void Propagate(const CuMatrixBase<BaseFloat> &in,
- int32 num_chunks,
- CuMatrix<BaseFloat> *out) const;
- // Note: in_value and out_value are both dummy variables.
- virtual void Backprop(const CuMatrixBase<BaseFloat> &in_value,
- const CuMatrixBase<BaseFloat> &out_value,
- const CuMatrixBase<BaseFloat> &out_deriv,
- int32 num_chunks,
- Component *to_update, // may be identical to "this".
- CuMatrix<BaseFloat> *in_deriv) const;
- virtual Component* Copy() const;
-
- virtual void Read(std::istream &is, bool binary);
- virtual void Write(std::ostream &os, bool binary) const;
- virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
- virtual void Scale(BaseFloat scale);
- virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
- virtual void PerturbParams(BaseFloat stddev);
-
- virtual int32 GetParameterDim() const;
- virtual void Vectorize(VectorBase<BaseFloat> *params) const;
- virtual void UnVectorize(const VectorBase<BaseFloat> ¶ms);
- private:
- void Refresh(); // Refreshes params_ from log_params_.
- KALDI_DISALLOW_COPY_AND_ASSIGN(MixtureProbComponent);
-
- std::vector<CuMatrix<BaseFloat> > log_params_; // these are the
- // underlying parameters that are subject to gradient descent.
- std::vector<CuMatrix<BaseFloat> > params_; // these are derived from
- // log_params_.
- int32 input_dim_;
- int32 output_dim_;
-};
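A hedged sketch of the parameterization described in that comment (plain C++; the removed Refresh() presumably did something equivalent): keep unnormalized log-probs and derive the probability matrix by per-column softmax, so each column stays nonnegative and sums to one.

#include <cmath>
#include <cstdio>

int main() {
  const int rows = 3, cols = 2;
  double log_params[rows][cols] = {{0.0, 1.0}, {1.0, 0.0}, {2.0, -1.0}};
  double params[rows][cols];
  // Per-column softmax: exponentiate, then normalize each column to sum to 1.
  for (int j = 0; j < cols; j++) {
    double denom = 0.0;
    for (int i = 0; i < rows; i++) denom += std::exp(log_params[i][j]);
    for (int i = 0; i < rows; i++)
      params[i][j] = std::exp(log_params[i][j]) / denom;
  }
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) printf("%.4f ", params[i][j]);
    printf("\n");
  }
  return 0;
}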
-
-
// SumGroupComponent is used to sum up groups of posteriors.
// It's used to introduce a kind of Gaussian-mixture-model-like
// idea into neural nets. This is basically a degenerate case of