author    pegahgh <pegahgh@gmail.com>    Fri, 15 Sep 2017 22:05:16 +0000 (18:05 -0400)
committer Daniel Povey <dpovey@gmail.com>    Fri, 15 Sep 2017 22:05:16 +0000 (18:05 -0400)
20 files changed:
diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS
index ecafb588cfea373c683d23dafa2608d9ff7c5ace..a8156e10e14e3f1da7fd58dd1aebd42dddd2927b 100644 (file)
--- a/egs/rm/s5/RESULTS
+++ b/egs/rm/s5/RESULTS
@@ -234,6 +234,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/
%WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0
%WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0
+### WSJ->RM Transfer learning using chain model ###
+%WER 1.68 [ 210 / 12533, 25 ins, 33 del, 152 sub ] exp/chain/tdnn_wsj_rm_1a/decode/wer_2_0.0
+
### nnet1 results ###
# dnn4b, MFCC,LDA,fMLLR features, (Karel - 30.7.2015)
diff --git a/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh b/egs/rm/s5/local/chain/run_tdnn_wsj_rm.sh
--- /dev/null
@@ -0,0 +1 @@
+tuning/run_tdnn_wsj_rm_1a.sh
\ No newline at end of file
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1a.sh
--- /dev/null
@@ -0,0 +1,208 @@
+#!/bin/bash
+
+# This script uses weight transfer as a transfer learning method to transfer an
+# already-trained neural net model from wsj to rm.
+#
+# Model preparation: The last layers (prefinal and output layer) of the
+# already-trained wsj model are removed and 3 randomly initialized layers
+# (new tdnn layer, prefinal, and output) are added to the model.
+#
+# Training: The transferred layers are retrained with a smaller learning rate,
+# while the newly added layers are trained with a larger learning rate, using rm data.
+# The chain config is as in run_tdnn_5n.sh and the result is:
+#System tdnn_5n tdnn_wsj_rm_1a
+#WER 2.71 1.68
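+#
+# A minimal sketch of the model preparation done in stage 7 below (see that
+# stage for the real command; the 0.25 value here is illustrative):
+#   nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=0.25" \
+#     $src_mdl - | \
+#   nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw
+# i.e. scale the learning-rate factors of all transferred components, then add
+# the newly initialized layers from the generated config.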
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-10
+get_egs_stage=-10
+dir=exp/chain/tdnn_wsj_rm_1a
+xent_regularize=0.1
+
+# configs for transfer learning
+src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # Input chain model
+ # trained on source dataset (wsj).
+ # This model is transferred to the target domain.
+
+src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim
+ # mfcc features for ivector and DNN training
+ # in the source domain.
+src_ivec_extractor_dir= # Source ivector extractor dir used to extract ivector for
+ # source data. The ivector for target data is extracted using this extractor.
+ # It must be nonempty if ivectors were used in training the source model.
+
+common_egs_dir=
+primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source
+ # model; e.g. if 0, the parameters transferred from the source model
+ # are fixed.
+ # The learning-rate factor for newly added layers is 1.0.
+
+nnet_affix=_online_wsj
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+required_files="$src_mfcc_config $src_mdl"
+use_ivector=false
+ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2)
+if [ -z $ivector_dim ]; then ivector_dim=0 ; fi
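+# (Illustrative; the exact header format is assumed: nnet3-am-info prints a
+# line like 'ivector-dim: 100', from which the grep/cut above extracts the
+# number. An empty result means the source model has no ivector input.)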
+
+if [ ! -z $src_ivec_extractor_dir ]; then
+ if [ $ivector_dim -eq 0 ]; then
+ echo "$0: Source ivector extractor dir '$src_ivec_extractor_dir' is specified "
+ echo "but ivector is not used in training the source model '$src_mdl'."
+ else
+ required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie"
+ use_ivector=true
+ fi
+else
+ if [ $ivector_dim -gt 0 ]; then
+ echo "$0: ivector is used in training the source model '$src_mdl' but no "
+ echo " --src-ivec-extractor-dir option as ivector dir for source model is specified." && exit 1;
+ fi
+fi
+
+for f in $required_files; do
+ if [ ! -f $f ]; then
+ echo "$0: no such file $f." && exit 1;
+ fi
+done
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 4" if you have already
+# run those things.
+
+ali_dir=exp/tri3b_ali
+treedir=exp/chain/tri4_5n_tree
+lang=data/lang_chain_5n
+
+local/online/run_nnet2_common.sh --stage $stage \
+ --ivector-dim $ivector_dim \
+ --nnet-affix "$nnet_affix" \
+ --mfcc-config $src_mfcc_config \
+ --extractor $src_ivec_extractor_dir || exit 1;
+
+if [ $stage -le 4 ]; then
+ # Get the alignments as lattices (gives the chain training more freedom).
+ # use the same num-jobs as the alignments
+ nj=$(cat $ali_dir/num_jobs) || exit 1;
+ steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \
+ data/lang exp/tri3b exp/tri3b_lats || exit 1;
+ rm exp/tri3b_lats/fsts.*.gz 2>/dev/null || true # save space
+fi
+
+if [ $stage -le 5 ]; then
+ # Create a version of the lang/ directory that has one state per phone in the
+ # topo file. [note, it really has two states.. the first one is only repeated
+ # once, the second one has zero or more repeats.]
+ rm -r $lang 2>/dev/null || true
+ cp -r data/lang $lang
+ silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+ nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on we may have to tune this
+  # topology.
+ steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 6 ]; then
+ # Build a tree using our new topology.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+ --leftmost-questions-truncate -1 \
+ --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  echo "$0: Creating neural net configs using the xconfig parser for";
+  echo " generating new layers that are specific to rm. These layers ";
+  echo " are added to the transferred part of the wsj network.";
+ num_targets=$(tree-info --print-args=false $treedir/tree |grep num-pdfs|awk '{print $2}')
+ learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
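+  # e.g. with xent_regularize=0.1 this gives 0.5/0.1 = 5.0; as in other chain
+  # recipes, the larger factor is presumably there to compensate for the xent
+  # branch's derivatives being scaled down during training.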
+ mkdir -p $dir
+ mkdir -p $dir/configs
+ cat <<EOF > $dir/configs/network.xconfig
+ relu-renorm-layer name=tdnn-target input=Append(tdnn6.renorm@-3,tdnn6.renorm) dim=450
+ ## adding the layers for chain branch
+ relu-renorm-layer name=prefinal-chain input=tdnn-target dim=450 target-rms=0.5
+ output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
+ relu-renorm-layer name=prefinal-xent input=tdnn-target dim=450 target-rms=0.5
+ output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
+EOF
+ steps/nnet3/xconfig_to_configs.py --existing-model $src_mdl \
+ --xconfig-file $dir/configs/network.xconfig \
+ --config-dir $dir/configs/
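+
+  # Note: with --existing-model, component-node names from $src_mdl (such as
+  # tdnn6.renorm above) can be used as inputs to the new xconfig layers; this
+  # is handled by the 'existing' xconfig layer type added in this commit.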
+
+  # Set the learning-rate-factor of the transferred layers to
+  # primary_lr_factor, and add the new layers to the model.
+ $train_cmd $dir/log/generate_input_mdl.log \
+ nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor" $src_mdl - \| \
+ nnet3-init --srand=1 - $dir/configs/final.config $dir/input.raw || exit 1;
+fi
+
+if [ $stage -le 8 ]; then
+ echo "$0: generate egs for chain to train new model on rm dataset."
+ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+ utils/create_split_dir.pl \
+ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+ fi
+ ivector_dir=
+ if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi
+
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --trainer.input-model $dir/input.raw \
+ --feat.online-ivector-dir "$ivector_dir" \
+    --chain.xent-regularize $xent_regularize \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --chain.lm-opts="--num-extra-lm-states=200" \
+ --egs.dir "$common_egs_dir" \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width 150 \
+ --trainer.num-chunk-per-minibatch=128 \
+ --trainer.frames-per-iter 1000000 \
+ --trainer.num-epochs 2 \
+ --trainer.optimization.num-jobs-initial=2 \
+ --trainer.optimization.num-jobs-final=4 \
+ --trainer.optimization.initial-effective-lrate=0.005 \
+ --trainer.optimization.final-effective-lrate=0.0005 \
+ --trainer.max-param-change 2.0 \
+ --cleanup.remove-egs true \
+ --feat-dir data/train_hires \
+ --tree-dir $treedir \
+ --lat-dir exp/tri3b_lats \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 9 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ ivec_opt=""
+ if $use_ivector;then
+ ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test"
+ fi
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --scoring-opts "--min-lmwt 1" \
+ --nj 20 --cmd "$decode_cmd" $ivec_opt \
+ $dir/graph data/test_hires $dir/decode || exit 1;
+fi
+wait;
+exit 0;
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1b.sh
--- /dev/null
@@ -0,0 +1,221 @@
+#!/bin/bash
+# _1b is as _1a, but differs as follows:
+# 1) It uses the wsj phone set (phones.txt) and a new lexicon generated using the
+#    word pronunciations in the wsj lexicon.txt. rm words that are not present
+#    in wsj are added as oov to the new lexicon.txt.
+# 2) It uses the wsj tree-dir and generates new alignments and lattices for rm
+#    using the wsj gmm model.
+# 3) It also trains a phone LM using a weighted combination of alignments from
+#    wsj and rm, which is used in the chain denominator graph.
+#    Since we use phones.txt from the source dataset, this can be helpful in
+#    cases where there is little training data in the target domain and some
+#    4-gram phone sequences have no counts in the target domain.
+# 4) It uses the whole already-trained model and does not replace its output
+#    layer with a new randomly initialized one; the existing output layer is
+#    re-trained using the target dataset.
+
+
+# This script uses weight transfer as a transfer learning method: it takes an
+# already-trained model on wsj and fine-tunes the whole network using rm data,
+# while training the last layer (output layer) with a higher learning rate.
+# The chain config is as in run_tdnn_5n.sh and the result is:
+# System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c
+# WER 2.71 1.68 3.56 3.54
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-4
+get_egs_stage=-10
+tdnn_affix=_1b
+
+# configs for transfer learning
+common_egs_dir=
+primary_lr_factor=0.25 # The learning-rate factor for transferred layers from source
+ # model; e.g. if 0, the parameters transferred from the source are fixed.
+ # The learning-rate factor for newly added layers is 1.0.
+nnet_affix=_online_wsj
+phone_lm_scales="1,10" # comma-separated list of positive integer multiplicities
+ # to apply to the different source data directories (used
+ # to give the RM data a higher weight).
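+ # e.g. "1,10" counts each wsj alignment once and each rm
+ # alignment ten times when estimating the phone LM for the
+ # denominator graph.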
+
+# model and dirs for source model used for transfer learning
+src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # Input chain model
+ # trained on source dataset (wsj).
+ # This model is transferred to the target domain.
+
+src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim
+ # mfcc features for ivector and DNN training
+ # in the source domain.
+src_ivec_extractor_dir= # Source ivector extractor dir used to extract ivector for
+ # source data and the ivector for target data is extracted using this extractor.
+ # It must be nonempty if ivectors were used in training the source model.
+
+src_lang=../../wsj/s5/data/lang # Source lang directory used to train source model.
+ # new lang dir for transfer learning experiment is prepared
+ # using source phone set phones.txt and lexicon.txt
+ # in src lang and dict dirs and words.txt in target lang dir.
+
+src_dict=../../wsj/s5/data/local/dict_nosp # dictionary for source dataset containing lexicon.txt,
+ # nonsilence_phones.txt,...
+ # lexicon.txt used to generate lexicon.txt for
+ # src-to-tgt transfer.
+
+src_gmm_dir=../../wsj/s5/exp/tri4b # source gmm dir used to generate alignments
+ # for target data.
+
+src_tree_dir=../../wsj/s5/exp/chain/tree_a_sp # chain tree-dir for src data;
+ # the alignment in target domain is
+ # converted using src-tree
+
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# dirs for src-to-tgt transfer learning experiment
+lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from
+ # WSJ and wordlist and G.fst from RM.
+lat_dir=exp/tri3b_lats_wsj
+dir=exp/chain/tdnn_wsj_rm${tdnn_affix}
+
+
+required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon.txt $src_gmm_dir/final.mdl $src_tree_dir/tree"
+
+
+use_ivector=false
+ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2)
+if [ -z $ivector_dim ]; then ivector_dim=0 ; fi
+
+if [ ! -z $src_ivec_extractor_dir ]; then
+ if [ $ivector_dim -eq 0 ]; then
+ echo "$0: Source ivector extractor dir '$src_ivec_extractor_dir' is specified "
+ echo "but ivector is not used in training the source model '$src_mdl'."
+ else
+ required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie"
+ use_ivector=true
+ fi
+else
+ if [ $ivector_dim -gt 0 ]; then
+ echo "$0: ivector is used in training the source model '$src_mdl' but no "
+ echo " --src-ivec-extractor-dir option as ivector dir for source model is specified." && exit 1;
+ fi
+fi
+
+
+for f in $required_files; do
+ if [ ! -f $f ]; then
+ echo "$0: no such file $f" && exit 1;
+ fi
+done
+
+if [ $stage -le -1 ]; then
+ echo "$0: Prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list."
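+  # If the source and target phone sets are identical, data/lang can be reused
+  # directly; otherwise build a combined lang dir from the wsj phones/lexicon
+  # and the rm word list.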
+ if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" data/lang/phones.txt); then
+ local/prepare_wsj_rm_lang.sh $src_dict $src_lang $lang_src_tgt
+ else
+ rm -rf $lang_src_tgt 2>/dev/null || true
+ cp -r data/lang $lang_src_tgt
+ fi
+fi
+
+local/online/run_nnet2_common.sh --stage $stage \
+ --ivector-dim $ivector_dim \
+ --nnet-affix "$nnet_affix" \
+ --mfcc-config $src_mfcc_config \
+ --extractor $src_ivec_extractor_dir || exit 1;
+
+if [ $stage -le 4 ]; then
+ # Get the alignments as lattices (gives the chain training more freedom).
+ # use the same num-jobs as the alignments
+ nj=$(cat exp/tri3b_ali/num_jobs) || exit 1;
+ steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+ --generate-ali-from-lats true \
+ data/train $lang_src_tgt $src_gmm_dir $lat_dir || exit 1;
+ rm $lat_dir/fsts.*.gz 2>/dev/null || true # save space
+fi
+
+if [ $stage -le 5 ]; then
+ # Set the learning-rate-factor for all transferred layers but the last output
+ # layer to primary_lr_factor.
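+  # The edit rules apply in order: the first sets every component's
+  # learning-rate factor to $primary_lr_factor, then the 'output*' rule
+  # restores a factor of 1.0 for the output layer so it is fine-tuned at the
+  # full rate.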
+ $train_cmd $dir/log/generate_input_mdl.log \
+ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \
+ $src_mdl $dir/input.raw || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  echo "$0: compute {den,normalization}.fst using weighted phone LM with wsj and rm weights $phone_lm_scales."
+ steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \
+ --num-repeats $phone_lm_scales \
+ --lm-opts '--num-extra-lm-states=200' \
+ $src_tree_dir $lat_dir $dir || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+ utils/create_split_dir.pl \
+ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+ fi
+ # exclude phone_LM and den.fst generation training stages
+ if [ $train_stage -lt -4 ]; then train_stage=-4 ; fi
+
+ ivector_dir=
+ if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi
+
+ steps/nnet3/chain/train.py --stage $train_stage \
+ --cmd "$decode_cmd" \
+ --trainer.input-model $dir/input.raw \
+ --feat.online-ivector-dir "$ivector_dir" \
+      --chain.xent-regularize 0.1 \
+      --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --egs.dir "$common_egs_dir" \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width 150 \
+ --trainer.num-chunk-per-minibatch=128 \
+ --trainer.frames-per-iter 1000000 \
+ --trainer.num-epochs 2 \
+ --trainer.optimization.num-jobs-initial=2 \
+ --trainer.optimization.num-jobs-final=4 \
+ --trainer.optimization.initial-effective-lrate=0.005 \
+ --trainer.optimization.final-effective-lrate=0.0005 \
+ --trainer.max-param-change 2 \
+ --cleanup.remove-egs true \
+ --feat-dir data/train_hires \
+ --tree-dir $src_tree_dir \
+ --lat-dir $lat_dir \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 8 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+ ivec_opt=""
+ if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi
+ utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --scoring-opts "--min-lmwt 1" \
+ --nj 20 --cmd "$decode_cmd" $ivec_opt \
+ $dir/graph data/test_hires $dir/decode || exit 1;
+fi
+wait;
+exit 0;
diff --git a/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh b/egs/rm/s5/local/chain/tuning/run_tdnn_wsj_rm_1c.sh
--- /dev/null
@@ -0,0 +1,229 @@
+#!/bin/bash
+# _1c is as _1b, but it uses the source chain-trained DNN model instead of the
+# GMM model to generate alignments for RM.
+
+# _1b is as _1a, but differs as follows:
+# 1) It uses the wsj phone set (phones.txt) and a new lexicon generated using the
+#    word pronunciations in the wsj lexicon.txt. rm words that are not present
+#    in wsj are added as oov to the new lexicon.txt.
+# 2) It uses the wsj tree-dir and generates new alignments and lattices for rm
+#    using the wsj gmm model.
+# 3) It also trains a phone LM using a weighted combination of alignments from
+#    wsj and rm, which is used in the chain denominator graph.
+#    Since we use phones.txt from the source dataset, this can be helpful in
+#    cases where there is little training data in the target domain and some
+#    4-gram phone sequences have no counts in the target domain.
+# 4) It transfers all layers of the already-trained model and re-trains the
+#    last layer using the target dataset, instead of replacing it with a new
+#    randomly initialized output layer.
+
+# This script uses weight transfer as a transfer learning method: it takes an
+# already-trained model on wsj and fine-tunes the whole network using rm data,
+# while training the last layer with a higher learning rate.
+# The chain config is as in run_tdnn_5n.sh and the result is:
+# System tdnn_5n tdnn_wsj_rm_1a tdnn_wsj_rm_1b tdnn_wsj_rm_1c
+# WER 2.71 1.68 3.56 3.54
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-4
+get_egs_stage=-10
+dir=exp/chain/tdnn_wsj_rm_1c
+
+# configs for transfer learning
+
+common_egs_dir=
+primary_lr_factor=0.25 # learning-rate factor for all except last layer in transferred source model
+nnet_affix=_online_wsj
+
+phone_lm_scales="1,10" # comma-separated list of positive integer multiplicities
+ # to apply to the different source data directories (used
+ # to give the RM data a higher weight).
+
+# model and dirs for source model used for transfer learning
+src_mdl=../../wsj/s5/exp/chain/tdnn1d_sp/final.mdl # input chain model
+ # trained on source dataset (wsj) and
+ # this model is transferred to the target domain.
+
+src_mfcc_config=../../wsj/s5/conf/mfcc_hires.conf # mfcc config used to extract higher dim
+ # mfcc features used for ivector training
+ # in source domain.
+src_ivec_extractor_dir= # source ivector extractor dir used to extract ivector for
+ # source data and the ivector for target data is extracted using this extractor.
+ # It must be nonempty if ivectors were used in training the source model.
+
+src_lang=../../wsj/s5/data/lang # source lang directory used to train source model.
+ # new lang dir for transfer learning experiment is prepared
+ # using the source phone set (phones.txt) and lexicon.txt in the src lang dir
+ # and words.txt in the target lang dir.
+src_dict=../../wsj/s5/data/local/dict_nosp # dictionary for source dataset containing lexicon.txt,
+ # nonsilence_phones.txt,...
+ # lexicon.txt used to generate lexicon.txt for
+ # src-to-tgt transfer.
+
+src_tree_dir=../../wsj/s5/exp/chain/tree_a_sp # chain tree-dir for src data;
+ # the alignment in target domain is
+ # converted using src-tree
+
+# End configuration section.
+
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# dirs for src-to-tgt transfer experiment
+lang_dir=data/lang_chain_5n # lang dir for target data.
+lang_src_tgt=data/lang_wsj_rm # This dir is prepared using phones.txt and lexicon from
+ # source (WSJ) and wordlist and G.fst from target (RM).
+lat_dir=exp/chain_lats_wsj
+
+required_files="$src_mfcc_config $src_mdl $src_lang/phones.txt $src_dict/lexicon.txt $src_tree_dir/tree"
+
+use_ivector=false
+ivector_dim=$(nnet3-am-info --print-args=false $src_mdl | grep "ivector-dim" | cut -d" " -f2)
+if [ -z $ivector_dim ]; then ivector_dim=0 ; fi
+
+if [ ! -z $src_ivec_extractor_dir ]; then
+ if [ $ivector_dim -eq 0 ]; then
+ echo "$0: Source ivector extractor dir '$src_ivec_extractor_dir' is "
+ echo "specified but ivector is not used in training the source model '$src_mdl'."
+ else
+ required_files="$required_files $src_ivec_extractor_dir/final.dubm $src_ivec_extractor_dir/final.mat $src_ivec_extractor_dir/final.ie"
+ use_ivector=true
+ fi
+else
+ if [ $ivector_dim -gt 0 ]; then
+ echo "$0: ivector is used in training the source model '$src_mdl' but no "
+ echo " --src-ivec-extractor-dir option as ivector dir for source model is specified." && exit 1;
+ fi
+fi
+
+
+for f in $required_files; do
+ if [ ! -f $f ]; then
+ echo "$0: no such file $f" && exit 1;
+ fi
+done
+
+if [ $stage -le -1 ]; then
+ echo "$0: Prepare lang for RM-WSJ using WSJ phone set and lexicon and RM word list."
+ if ! cmp -s <(grep -v "^#" $src_lang/phones.txt) <(grep -v "^#" $lang_dir/phones.txt); then
+ local/prepare_wsj_rm_lang.sh $src_dict $src_lang $lang_src_tgt || exit 1;
+ else
+ rm -rf $lang_src_tgt 2>/dev/null || true
+ cp -r $lang_dir $lang_src_tgt
+ fi
+fi
+
+local/online/run_nnet2_common.sh --stage $stage \
+ --ivector-dim $ivector_dim \
+ --nnet-affix "$nnet_affix" \
+ --mfcc-config $src_mfcc_config \
+ --extractor $src_ivec_extractor_dir || exit 1;
+src_mdl_dir=`dirname $src_mdl`
+ivec_opt=""
+if $use_ivector;then ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors" ; fi
+
+if [ $stage -le 4 ]; then
+ # Get the alignments as lattices (gives the chain training more freedom).
+ # use the same num-jobs as the alignments
+ steps/nnet3/align_lats.sh --nj 100 --cmd "$train_cmd" $ivec_opt \
+ --generate-ali-from-lats true \
+ --acoustic-scale 1.0 --extra-left-context-initial 0 --extra-right-context-final 0 \
+ --frames-per-chunk 150 \
+ --scale-opts "--transition-scale=1.0 --self-loop-scale=1.0" \
+ data/train_hires $lang_src_tgt $src_mdl_dir $lat_dir || exit 1;
+ rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 5 ]; then
+ # Set the learning-rate-factor for all transferred layers but the last output
+ # layer to primary_lr_factor.
+ $train_cmd $dir/log/generate_input_mdl.log \
+ nnet3-am-copy --raw=true --edits="set-learning-rate-factor name=* learning-rate-factor=$primary_lr_factor; set-learning-rate-factor name=output* learning-rate-factor=1.0" \
+ $src_mdl $dir/input.raw || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: compute {den,normalization}.fst using weighted phone LM."
+ steps/nnet3/chain/make_weighted_den_fst.sh --cmd "$train_cmd" \
+ --num-repeats $phone_lm_scales \
+ --lm-opts '--num-extra-lm-states=200' \
+ $src_tree_dir $lat_dir $dir || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+ utils/create_split_dir.pl \
+ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+ fi
+ # exclude phone_LM and den.fst generation training stage
+ if [ $train_stage -lt -4 ]; then train_stage=-4 ; fi
+
+ ivector_dir=
+ if $use_ivector; then ivector_dir="exp/nnet2${nnet_affix}/ivectors" ; fi
+
+  # We use the chain model from the source to generate lattices for the target;
+  # the tolerance used in chain egs generation from these lattices should be
+  # 1 or 2, i.e. source_egs_tolerance / frame_subsampling_factor, with
+  # source_egs_tolerance = 5.
+ chain_opts=(--chain.alignment-subsampling-factor=1 --chain.left-tolerance=1 --chain.right-tolerance=1)
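+  # (Assuming the usual frame_subsampling_factor of 3 here: 5/3 ~= 1.7, which
+  # is consistent with the left/right tolerances of 1 set above.)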
+ steps/nnet3/chain/train.py --stage $train_stage ${chain_opts[@]} \
+ --cmd "$decode_cmd" \
+ --trainer.input-model $dir/input.raw \
+ --feat.online-ivector-dir "$ivector_dir" \
+        --chain.xent-regularize 0.1 \
+        --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.l2-regularize 0.00005 \
+ --chain.apply-deriv-weights false \
+ --egs.dir "$common_egs_dir" \
+ --egs.opts "--frames-overlap-per-eg 0" \
+ --egs.chunk-width 150 \
+ --trainer.num-chunk-per-minibatch=128 \
+ --trainer.frames-per-iter 1000000 \
+ --trainer.num-epochs 2 \
+ --trainer.optimization.num-jobs-initial=2 \
+ --trainer.optimization.num-jobs-final=4 \
+ --trainer.optimization.initial-effective-lrate=0.005 \
+ --trainer.optimization.final-effective-lrate=0.0005 \
+ --trainer.max-param-change 2.0 \
+ --cleanup.remove-egs true \
+ --feat-dir data/train_hires \
+ --tree-dir $src_tree_dir \
+ --lat-dir $lat_dir \
+ --dir $dir || exit 1;
+fi
+
+if [ $stage -le 8 ]; then
+ # Note: it might appear that this $lang directory is mismatched, and it is as
+ # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+ # the lang directory.
+  test_ivec_opt=""
+  if $use_ivector; then test_ivec_opt="--online-ivector-dir exp/nnet2${nnet_affix}/ivectors_test" ; fi
+
+ utils/mkgraph.sh --self-loop-scale 1.0 $lang_src_tgt $dir $dir/graph
+ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+ --scoring-opts "--min-lmwt 1" \
+ --nj 20 --cmd "$decode_cmd" $test_ivec_opt \
+ $dir/graph data/test_hires $dir/decode || exit 1;
+fi
+wait;
+exit 0;
diff --git a/egs/rm/s5/local/online/run_nnet2_common.sh b/egs/rm/s5/local/online/run_nnet2_common.sh
index 1cd8abfba5478c5fac0596ff62810b3711855db1..f1f194fea267095a4760d93d81e15f8100a975d6 100755 (executable)
#!/bin/bash
-
+# This script extracts MFCC features using mfcc_config, trains a UBM and an
+# ivector extractor, and extracts ivectors for the train and test sets.
. cmd.sh
stage=1
-
+nnet_affix=_online
+extractor=exp/nnet2${nnet_affix}/extractor
+ivector_dim=50
+mfcc_config=conf/mfcc_hires.conf
+use_ivector=true # If false, the ivector-extractor training and
+ # ivector-extraction stages are skipped.
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
num_threads=16
minibatch_size=128
parallel_opts="--num-threads $num_threads"
- dir=exp/nnet2_online/nnet
+ dir=exp/nnet2${nnet_affix}/nnet
fi
+train_set=train
+if [ $stage -le 0 ]; then
+ echo "$0: creating high-resolution MFCC features."
+ mfccdir=data/${train_set}_hires/data
-if [ $stage -le 1 ]; then
- mkdir -p exp/nnet2_online
- steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
- data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
+ for datadir in $train_set test; do
+ utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
+
+ steps/make_mfcc.sh --nj 30 --mfcc-config $mfcc_config \
+ --cmd "$train_cmd" data/${datadir}_hires || exit 1;
+ steps/compute_cmvn_stats.sh data/${datadir}_hires
+ utils/fix_data_dir.sh data/${datadir}_hires
+ done
fi
-if [ $stage -le 2 ]; then
- # use a smaller iVector dim (50) than the default (100) because RM has a very
- # small amount of data.
- steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
- --ivector-dim 50 \
- data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
+train_set=${train_set}_hires
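+# If a trained extractor was supplied via --extractor (e.g. one trained on
+# wsj), final.ie already exists and the UBM/extractor training below is
+# skipped; the extractor is then only used to extract ivectors.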
+if [ ! -f $extractor/final.ie ] && [ $ivector_dim -gt 0 ]; then
+ if [ $stage -le 1 ]; then
+ mkdir -p exp/nnet2${nnet_affix}
+ steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 40 --num-frames 200000 \
+ data/${train_set} 256 exp/tri3b exp/nnet2${nnet_affix}/diag_ubm
+ fi
+
+ if [ $stage -le 2 ]; then
+ # use a smaller iVector dim (50) than the default (100) because RM has a very
+ # small amount of data.
+ steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 40 \
+ --ivector-dim $ivector_dim \
+ data/${train_set} exp/nnet2${nnet_affix}/diag_ubm $extractor || exit 1;
+ fi
fi
-if [ $stage -le 3 ]; then
+if [ $stage -le 3 ] && [ $ivector_dim -gt 0 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
- steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
+ steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set} data/${train_set}_max2
+
+ steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 40 \
+ data/${train_set}_max2 $extractor exp/nnet2${nnet_affix}/ivectors || exit 1;
- steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
- data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
+ steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
+ data/test_hires $extractor exp/nnet2${nnet_affix}/ivectors_test || exit 1;
fi
diff --git a/egs/rm/s5/local/prepare_wsj_rm_lang.sh b/egs/rm/s5/local/prepare_wsj_rm_lang.sh
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Copyright 2017 Pegah Ghahremani
+
+# This script prepares a dictionary and lang directory for the wsj-to-rm
+# transfer learning experiment, using the wsj phone set (phones.txt), lexicon
+# (lexicon.txt) and dict dir. The new lexicon.txt is created for the words in
+# rm words.txt as follows:
+# 1) Pronunciations are copied from the wsj lexicon.txt for words common to
+#    wsj and rm.
+# 2) Words in rm that are not in the wsj lexicon are added as oov to the new
+#    lexicon.txt.
+# The wsj oov word "<SPOKEN_NOISE>" is also added to words.txt, and G.fst is
+# recompiled using the updated word list.
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. utils/parse_options.sh
+
+if [ $# != 3 ]; then
+ echo "Usage: local/prepare_wsj_rm_lang.sh <src-dict> <src-lang> <output-dir>"
+ echo "e.g:"
+ echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang_nosp data/wsj_rm_dir"
+fi
+
+src_dict=$1
+src_lang=$2
+output_dir=$3
+
+required_dict_files="$src_dict/lexicon.txt $src_dict/nonsilence_phones.txt $src_dict/silence_phones.txt $src_dict/optional_silence.txt $src_lang/oov.txt $src_lang/phones.txt"
+for f in $required_dict_files; do
+ if [ ! -f $f ]; then
+ echo "$0: file $f that is required for preparing lang does not exist." && exit 1;
+ fi
+done
+
+rm -r $output_dir 2>/dev/null || true
+mkdir -p $output_dir
+mkdir -p $output_dir/local
+# copy *phones.txt from source to target.
+cp -r $src_dict $output_dir/local/dict
+rm $output_dir/local/dict/lexicon*.txt
+
+oov_word=`cat $src_lang/oov.txt`
+# words common to the rm lexicon and the wsj lexicon
+comm -12 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \
+ <(awk '{print $1}' $src_dict/lexicon.txt | sort) | \
+ sed -r "s/'/+/g" | sort > $output_dir/words_tmp.txt
+
+comm -23 <(awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort) \
+ <(awk '{print $1}' $src_dict/lexicon.txt | sort) | \
+ sed -r "s/'/+/g" | sort > $output_dir/words_only_tgt.txt
+
+# add oov_word to word list
+(echo "$oov_word"; cat $output_dir/words_tmp.txt) | sort > $output_dir/words_tgt_src.txt
+rm $output_dir/words_tmp.txt
+
+# We use the wsj lexicon and the list of words common to rm and wsj to
+# generate a lexicon for rm-wsj using the wsj phone set. More than 90% of the
+# words in RM are in WSJ (950/994).
+cat $output_dir/words_tgt_src.txt | sed "s/\+/\'/g" | \
+utils/apply_map.pl --permissive $src_dict/lexicon.txt | \
+ paste <(cat $output_dir/words_tgt_src.txt) - > $output_dir/local/dict/lexicon_tgt_src.txt
+
+# extend lexicon.txt by adding the target-only words as oov.
+oov_phone=`grep "$oov_word" $src_dict/lexicon.txt | cut -d' ' -f2`
+cat $output_dir/local/dict/lexicon_tgt_src.txt <(sed "s/$/ $oov_phone/g" $output_dir/words_only_tgt.txt) | sort -u > $output_dir/local/dict/lexicon.txt
+
+# prepare the dictionary using the new lexicon.txt for RM-WSJ.
+utils/prepare_lang.sh --phone-symbol-table $src_lang/phones.txt \
+ $output_dir/local/dict "$oov_word" $output_dir/local/lang_tmp $output_dir
+
+# Generate a new G.fst using the updated word list with <SPOKEN_NOISE> added.
+fstcompile --isymbols=$output_dir/words.txt --osymbols=$output_dir/words.txt --keep_isymbols=false \
+ --keep_osymbols=false data/local/tmp/G.txt | fstarcsort --sort_type=ilabel > $output_dir/G.fst || exit 1;
diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh
index d0f4675cf8368180702ed666d84f1b9f2f765761..187d9bf56879dc1e180dccbac40cd971173dc4d7 100755 (executable)
# alignments of alternative pronunciations in them. Mainly intended
# as a precursor to CTC training for now.
-# Begin configuration section.
+# Begin configuration section.
stage=0
nj=4
cmd=run.pl
# gmm-latgen-faster defaults to may help.)
boost_silence=1.0 # factor by which to boost silence during alignment.
fmllr_update_type=full
+generate_ali_from_lats=false # If true, alignments are generated from the lattices.
# End configuration options.
echo "$0 $@" # Print the command line for logging
## because the other scripts write them without transition probs.
if [ $stage -le 0 ]; then
echo "$0: compiling training graphs"
- tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
+ tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
"ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
# Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more
# alignment errors (however, it does have a default min-active=200 so this
# will tend to reduce alignment errors).
- # --allow_partial=false makes sure we reach the end of the decoding graph.
+ # --allow_partial=false makes sure we reach the end of the decoding graph.
# --word-determinize=false makes sure we retain the alternative pronunciations of
# words (including alternatives regarding optional silences).
# --lattice-beam=$beam keeps all the alternatives that were within the beam,
"ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
fi
-rm $dir/pre_ali.*.gz
+if [ $stage -le 4 ] && $generate_ali_from_lats; then
+  # If generate_ali_from_lats is true, ali.*.gz is generated in the lats dir
+ $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
+ lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
+ ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+fi
+
+rm $dir/pre_ali.*.gz 2>/dev/null || true
echo "$0: done generating lattices from training transcripts."
diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py
index 52d97d9a0be6f4786dba2dab4047bd7123957942..d28629fa5d690c62588c000c1500ab731987157b 100644 (file)
common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir))
-def prepare_initial_acoustic_model(dir, run_opts, srand=-1):
- """ Adds the first layer; this will also add in the lda.mat and
- presoftmax_prior_scale.vec. It will also prepare the acoustic model
- with the transition model."""
-
- common_train_lib.prepare_initial_network(dir, run_opts,
- srand=srand)
+def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None):
+    """ This function adds the first layer; it will also prepare the acoustic
+    model with the transition model.
+    If 'input_model' is specified, no initial network preparation (adding
+    the first layer) is done; that model is used as the initial 'raw' model
+    instead of '0.raw' when preparing '0.mdl' (the acoustic model) by adding
+    the transition model.
+    """
+ if input_model is None:
+ common_train_lib.prepare_initial_network(dir, run_opts,
+ srand=srand)
# The model-format for a 'chain' acoustic model is just the transition
# model and then the raw nnet, so we can use 'cat' to create this, as
# before concatenating them.
common_lib.execute_command(
"""{command} {dir}/log/init_mdl.log \
- nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \
- {dir}/0.mdl""".format(command=run_opts.command, dir=dir))
+ nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \
+ {dir}/0.mdl""".format(command=run_opts.command, dir=dir,
+ raw_mdl=(input_model if input_model is not None
+ else '{0}/0.raw'.format(dir))))
def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize,
diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py
index 1e7bd19cb8443e130b3dc0d95a76183f9e7b94c8..34a0f5bff5593ff98779a42743b6c5245599e0a8 100644 (file)
raise Exception('Error while parsing the file {0}'.format(var_file))
+def get_input_model_info(input_model):
+ """ This function returns a dictionary with keys "model_left_context" and
+ "model_right_context" and values equal to the left/right model contexts
+ for input_model.
+ This function is useful when using the --trainer.input-model option
+ instead of initializing the model using configs.
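+    e.g. (hypothetical values) for a model whose nnet3-info output begins
+    'left-context: 16' / 'right-context: 12', the returned dict is
+    {'model_left_context': 16, 'model_right_context': 12}.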
+ """
+ variables = {}
+ try:
+ out = common_lib.get_command_stdout("""nnet3-info {0} | """
+ """head -4 """.format(input_model))
+ # out looks like this
+ # left-context: 7
+ # right-context: 0
+ # num-parameters: 90543902
+ # modulus: 1
+ for line in out.split("\n"):
+ parts = line.split(":")
+ if len(parts) != 2:
+ continue
+ if parts[0].strip() == 'left-context':
+ variables['model_left_context'] = int(parts[1].strip())
+ elif parts[0].strip() == 'right-context':
+ variables['model_right_context'] = int(parts[1].strip())
+
+ except ValueError:
+ pass
+ return variables
+
+
def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id,
left_context, right_context,
left_context_initial=-1, right_context_final=-1):
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index 26ebedace83a878896e424c54a7dc47c1541d7b9..72d32129716c2a4e34c41081b9f42ce12806d8b7 100644 (file)
if not xutils.is_valid_line_name(self.name):
raise RuntimeError("Invalid value: name={0}".format(
key_to_value['name']))
+
+    # It is possible to have two layers with the same name in 'all_layers' if
+    # the layer type of one of them is 'existing'.
+    # Layers of type 'existing' correspond to the component-node names
+    # in an existing model to which we are adding layers.
+    # 'existing' layers do not appear in any config file, so a new layer
+    # with the same name can exist in 'all_layers'.
+    # e.g. it is possible to have an 'output-node' with name 'output' in the
+    # existing model, which is added to all_layers using layer type 'existing',
+    # and an 'output-node' of type 'output-layer' with the same name 'output'
+    # in 'all_layers'.
for prev_layer in all_layers:
- if self.name == prev_layer.name:
+ if (self.name == prev_layer.name and
+            prev_layer.layer_type != 'existing'):
raise RuntimeError("Name '{0}' is used for more than one "
"layer.".format(self.name))
return ans
+class XconfigExistingLayer(XconfigLayerBase):
+ """
+ This class is used to internally convert component-nodes in an existing
+ model into lines like
+ 'existing name=tdnn1.affine dim=40'.
+
+    Layers of this type do not appear in any actual xconfig or config
+    files; they are created internally for all component nodes
+    in an existing neural net model, to be used as inputs to other layers
+    in the xconfig.
+ (i.e. get_model_component_info function, which is called in
+ steps/nnet3/xconfig_to_configs.py, parses the name and
+ dimension of component-nodes used in the existing model
+ using the nnet3-info and returns a list of 'existing' layers.)
+
+    This class is useful in cases like transferring an existing model
+    and using its {input, output, component}-nodes as inputs to new layers.
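+
+    Example (names hypothetical): a component-node 'tdnn1.affine' with
+    output-dim 512 in the existing model becomes the internal line
+    'existing name=tdnn1.affine dim=512', and new xconfig layers can then
+    use 'tdnn1.affine' as an input.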
+ """
+
+ def __init__(self, first_token, key_to_value, prev_names=None):
+
+ assert first_token == 'existing'
+ XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+
+ def set_default_configs(self):
+ self.config = { 'dim': -1}
+
+ def check_configs(self):
+ if self.config['dim'] <= 0:
+            raise RuntimeError("Dimension of existing-layer '{0}' "
+                               "should be positive.".format(self.name))
+
+ def get_input_descriptor_names(self):
+ return [] # there is no 'input' field in self.config.
+
+ def output_name(self, auxiliary_outputs=None):
+ # there are no auxiliary outputs as this layer will just pass the input
+ assert auxiliary_outputs is None
+ return self.name
+
+ def output_dim(self, auxiliary_outputs=None):
+ # there are no auxiliary outputs as this layer will just pass the input
+ assert auxiliary_outputs is None
+ return self.config['dim']
+
+ def get_full_config(self):
+        # unlike other layers, 'existing' layers should not be printed in
+        # any '*.config' file
+ ans = []
+ return ans
+
+
+
def test_layers():
# for some config lines that should be printed the same way as they
# are read, check that this is the case.
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 5cd9f2beef1abd65bafb311d777d80f0047ec550..99e9b22c5df3d08c048cb6011497af22eb5f858d 100644 (file)
import libs.nnet3.xconfig.layers as xlayers
import libs.nnet3.xconfig.utils as xutils
+import libs.common as common_lib
# We have to modify this dictionary when adding new layers
"*** {0}".format(config_line))
raise
-# This function reads an xconfig file and returns it as a list of layers
+
+def get_model_component_info(model_filename):
+ """
+    This function reads an existing model (*.raw or *.mdl) and returns a list
+    of XconfigExistingLayer objects, one per {input,output}-node or
+    component-node, with the same 'name' as used in the raw model and 'dim'
+    equal to the 'output-dim' for component-nodes and 'dim' for
+    {input,output}-nodes.
+
+ e.g. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer
+ 'input-node name=ivector dim=100' ->
+ 'existing name=ivector dim=100'
+ 'component-node name=tdnn1.affine ... input-dim=1000 '
+ 'output-dim=500' ->
+ 'existing name=tdnn1.affine dim=500'
+ """
+
+ all_layers = []
+ try:
+ f = open(model_filename, 'r')
+ except Exception as e:
+        sys.exit("{0}: error reading model file '{1}': {2}".format(
+            sys.argv[0], model_filename, repr(e)))
+
+ # use nnet3-info to get component names in the model.
+ out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\-node' """
+ """ """.format(model_filename))
+
+    # 'out' contains all {input, output, component}-nodes used in model_filename.
+    # The loop below parses lines such as:
+    #   input-node name=input dim=40
+    #   component-node name=tdnn1.affine component=tdnn1.affine input=lda
+    #     input-dim=300 output-dim=512
+ layer_names = []
+ key_to_value = dict()
+ for line in out.split("\n"):
+        parts = line.split(" ")
+        layer_name = None  # reset per line to avoid stale names leaking across lines
+        dim = -1
+ for field in parts:
+ key_value = field.split("=")
+ if len(key_value) == 2:
+ key = key_value[0]
+ value = key_value[1]
+ if key == "name": # name=**
+ layer_name = value
+ elif key == "dim": # for input-node
+ dim = int(value)
+ elif key == "output-dim": # for component-node
+ dim = int(value)
+
+ if layer_name is not None and layer_name not in layer_names:
+ layer_names.append(layer_name)
+ key_to_value['name'] = layer_name
+ assert(dim != -1)
+ key_to_value['dim'] = dim
+ all_layers.append(xlayers.XconfigExistingLayer('existing', key_to_value, all_layers))
+ if len(all_layers) == 0:
+ raise RuntimeError("{0}: model filename '{1}' is empty.".format(
+ sys.argv[0], model_filename))
+ f.close()
+ return all_layers
+
+
+# This function reads an xconfig file and returns it as a list of layers
# (usually we use the variable name 'all_layers' elsewhere for this).
# It will die if the xconfig file is empty or if there was
# some error parsing it.
-def read_xconfig_file(xconfig_filename):
+# 'existing_layers' may contain layers of type 'existing' (layers which are not
+# really layers but are actual component-node names from an existing neural net
+# model, created using the get_model_component_info function).
+# 'existing' layers can be used as inputs to layers in the xconfig file.
+def read_xconfig_file(xconfig_filename, existing_layers=None):
+    # Avoid a mutable default argument: callers that omit existing_layers
+    # get a fresh list on each call.
+    if existing_layers is None:
+        existing_layers = []
try:
f = open(xconfig_filename, 'r')
except Exception as e:
break
# the next call will raise an easy-to-understand exception if
# it fails.
- this_layer = xconfig_line_to_object(line, all_layers)
+ this_layer = xconfig_line_to_object(line, existing_layers)
if this_layer is None:
continue # line was blank after removing comments.
all_layers.append(this_layer)
+ existing_layers.append(this_layer)
if len(all_layers) == 0:
raise RuntimeError("{0}: xconfig file '{1}' is empty".format(
sys.argv[0], xconfig_filename))
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py
index 76477300884bf35a255a6537bc6d83fd10311bf6..9eae72871192de17f82d3c2559d6bb6e710ad562 100644 (file)
# Given a list of objects of type XconfigLayerBase ('all_layers'),
# including at least the layers preceding 'current_layer' (and maybe
# more layers), return the names of layers preceding 'current_layer'
+# other than layers of type 'existing', which correspond to component-node
+# names from an existing model to which we are adding layers.
# This will be used in parsing expressions like [-1] in descriptors
# (which is an alias for the previous layer).
def get_prev_names(all_layers, current_layer):
for layer in all_layers:
if layer is current_layer:
break
- prev_names.append(layer.get_name())
+
+    # The following if-statement is needed to handle the case where the
+    # layer is an 'existing' layer, derived from an existing trained
+ # neural network supplied via the existing-model option, that we are
+ # adding layers to. In this case, these layers are not considered as
+ # layers preceding 'current_layer'.
+    if layer.layer_type != 'existing':
+ prev_names.append(layer.get_name())
prev_names_set = set()
for name in prev_names:
if name in prev_names_set:
# This is a convenience function to parse the auxiliary output name from the
# full layer name
-
def split_layer_name(full_layer_name):
assert isinstance(full_layer_name, str)
split_name = full_layer_name.split('.')
for layer in all_layers:
if layer is current_layer:
break
+
+ # If 'all_layers' contains some 'existing' layers, i.e. layers which
+ # are not really layers but are actual component names from an existing
+ # neural net that we are adding components to, they may already be
+ # of the form 'xxx.yyy', e.g. 'tdnn1.affine'. In this case the name of
+ # the layer in 'all_layers' won't be just the 'xxx' part (e.g. 'tdnn1'),
+ # it will be the full thing, like 'tdnn1.affine'.
+ # We will also use the if-statement immediately below this comment for
+ # regular layers, e.g. where full_layer_name is something like 'tdnn2'.
+ # The if-statement below the next one, that uses
+ # auxiliary_output, will only be used in the (rare) case when we are
+ # using auxiliary outputs, e.g. 'lstm1.c'.
+ if layer.get_name() == full_layer_name:
+ return layer.output_dim()
+
if layer.get_name() == layer_name:
- if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None:
- raise RuntimeError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output))
+ if (not auxiliary_output in layer.auxiliary_outputs()
+ and auxiliary_output is not None):
+ raise RuntimeError("Layer '{0}' has no such auxiliary output:"
+ "'{1}' ({0}.{1})".format(layer_name,
+ auxiliary_output))
return layer.output_dim(auxiliary_output)
# No such layer was found.
if layer_name in [ layer.get_name() for layer in all_layers ]:
for layer in all_layers:
if layer is current_layer:
break
+
+ # The following if-statement is needed to handle the case where the
+ # layer is an 'existing' layer, derived from an existing trained
+ # neural network supplied via the --existing-model option, that we are
+ # adding layers to. In this case the name of the layer will actually
+ # be of the form xxx.yyy, e.g. 'tdnn1.affine'.
+ # The code path will also be taken for regular (non-'existing') layer
+ # names where the 'auxiliary_output' field is not used, which is actually
+ # the normal case (e.g. when 'full_layer_name' is 'lstm1',
+    # as opposed to, say, 'lstm1.c').
+ if layer.get_name() == full_layer_name:
+ return layer.output_name()
+
if layer.get_name() == layer_name:
- if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None:
- raise RuntimeError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(
+ if (not auxiliary_output in layer.auxiliary_outputs() and
+ auxiliary_output is not None):
+ raise RuntimeError("Layer '{0}' has no such auxiliary output: "
+ "'{1}' ({0}.{1})".format(
layer_name, auxiliary_output))
return layer.output_name(auxiliary_output)
# No such layer was found.
diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh
--- /dev/null
@@ -0,0 +1,177 @@
+#!/bin/bash
+# Copyright 2012 Brno University of Technology (Author: Karel Vesely)
+# 2013 Johns Hopkins University (Author: Daniel Povey)
+# 2015 Vijayaditya Peddinti
+# 2016 Vimal Manohar
+# 2017 Pegah Ghahremani
+# Apache 2.0
+
+# Computes training alignments using nnet3 DNN, with output to lattices.
+
+# Begin configuration section.
+nj=4
+cmd=run.pl
+stage=-1
+# Begin configuration.
+scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
+acoustic_scale=0.1
+beam=20
+transform_dir=
+iter=final
+frames_per_chunk=50
+extra_left_context=0
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
+online_ivector_dir=
+graphs_scp=
+generate_ali_from_lats=false # If true, alignments are generated from the lattices.
+# End configuration options.
+
+echo "$0 $@" # Print the command line for logging
+
+[ -f path.sh ] && . ./path.sh # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+ echo "Usage: $0 [--transform-dir <transform-dir>] <data-dir> <lang-dir> <src-dir> <align-dir>"
+ echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali"
+ echo "main options (for others, see top of script file)"
+ echo " --config <config-file> # config containing options"
+ echo " --nj <nj> # number of parallel jobs"
+ echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ exit 1;
+fi
+
+data=$1
+lang=$2
+srcdir=$3
+dir=$4
+
+oov=`cat $lang/oov.int` || exit 1;
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+sdata=$data/split${nj}utt
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \
+ split_data.sh --per-utt $data $nj || exit 1;
+
+extra_files=
+if [ ! -z "$online_ivector_dir" ]; then
+ steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1
+ extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+fi
+
+for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do
+ [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+cp $srcdir/{tree,${iter}.mdl} $dir || exit 1;
+
+utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
+cp $lang/phones.txt $dir || exit 1;
+## Set up features. Note: these are different from the normal features
+## because we have one rspecifier that has the features for the entire
+## training set, not separate ones for each batch.
+echo "$0: feature type is raw"
+
+cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
+cp $srcdir/cmvn_opts $dir 2>/dev/null
+
+feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+
+if [ ! -z "$transform_dir" ]; then
+ echo "$0: using transforms from $transform_dir"
+ [ ! -s $transform_dir/num_jobs ] && \
+ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
+ nj_orig=$(cat $transform_dir/num_jobs)
+
+ if [ ! -f $transform_dir/raw_trans.1 ]; then
+ echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)"
+ exit 1;
+ fi
+ if [ $nj -ne $nj_orig ]; then
+ # Copy the transforms into an archive with an index.
+ for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \
+ copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1;
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |"
+ else
+ # number of jobs matches with alignment dir.
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |"
+ fi
+fi
+
+ivector_opts=
+if [ ! -z "$online_ivector_dir" ]; then
+ ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
+ ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
+fi
+
+echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"
+
+frame_subsampling_opt=
+if [ -f $srcdir/frame_subsampling_factor ]; then
+ # e.g. for 'chain' systems
+ frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
+ frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
+ cp $srcdir/frame_subsampling_factor $dir
+ if [ "$frame_subsampling_factor" -gt 1 ] && \
+ [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then
+ echo "$0: frame-subsampling-factor is not 1 (so likely a chain system),"
+ echo "... but the scale opts are the defaults. You probably want"
+ echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'"
+ sleep 1
+ fi
+fi
+
+if [ ! -z "$graphs_scp" ]; then
+ if [ ! -f $graphs_scp ]; then
+ echo "Could not find graphs $graphs_scp" && exit 1
+ fi
+ tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |"
+ prog=compile-train-graphs-fsts
+else
+ tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
+ prog=compile-train-graphs
+fi
+
+if [ $stage -le 0 ]; then
+ ## because nnet3-latgen-faster doesn't support adding the transition-probs to the
+ ## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs,
+ ## because the other scripts write them without transition probs.
+ $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
+ $prog --read-disambig-syms=$lang/phones/disambig.int \
+ $scale_opts \
+ $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" \
+ "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1
+fi
+
+if [ $stage -le 1 ]; then
+  # Warning: nnet3-latgen-faster doesn't support a retry-beam, so you may get
+  # more alignment errors (however, it does have a default min-active=200,
+  # which will tend to reduce alignment errors).
+  # --allow-partial=false makes sure we reach the end of the decoding graph.
+  # --word-determinize=false makes sure we retain the alternative
+  # pronunciations of words (including alternatives regarding optional
+  # silences).
+  # --lattice-beam=$beam keeps all the alternatives that were within the beam;
+  # this means we do no pruning of the lattice (lattices from a training
+  # transcription will be small anyway).
+ $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
+ nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \
+ --frames-per-chunk=$frames_per_chunk \
+ --extra-left-context=$extra_left_context \
+ --extra-right-context=$extra_right_context \
+ --extra-left-context-initial=$extra_left_context_initial \
+ --extra-right-context-final=$extra_right_context_final \
+ --beam=$beam --lattice-beam=$beam \
+ --allow-partial=false --word-determinize=false \
+ $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \
+ "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
+fi
+
+if [ $stage -le 2 ] && $generate_ali_from_lats; then
+  # If --generate-ali-from-lats is true, ali.*.gz is generated in the lattice dir.
+ $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
+ lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \
+ ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
+fi
+echo "$0: done generating lattices from training transcripts."
diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh
--- /dev/null
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+# Copyright 2017 Vimal Manohar
+# 2017 Pegah Ghahremani
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
+# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
+# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABILITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script creates the denominator FST (den.fst) and normalization.fst for
+# chain training. It additionally copies the transition model and tree from
+# the first alignment directory to the chain directory.
+# The script can accept multiple sources of alignments with the same phone
+# set, which can be weighted when estimating the phone LM.
+# You can use the --num-repeats option to repeat some source data more than
+# once when training the LM for the denominator FST.
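+#
+# A hypothetical invocation, for illustration (the dirs are placeholders):
+#   steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats 1,3 \
+#     exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp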
+
+set -o pipefail
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+num_repeats= # Comma-separated list of positive integer multiplicities, one
+             # for each input alignment directory. The alignments from each
+             # source are repeated the corresponding number of times when
+             # training the LM.
+             # If not specified, a multiplicity of '1' is used for all sources.
+
+lm_opts='--num-extra-lm-states=2000'
+#end configuration section.
+
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -lt 2 ]; then
+ echo "Usage: $0 [options] <ali-dir1> [<ali-dir2> ...] <out-dir>";
+ echo "e.g.: $0 exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp";
+ echo "Options: "
+ echo " --cmd (run.pl|queue.pl...) # Specify how to run jobs.";
+  echo "  --lm-opts                          # Options for phone LM generation";
+  echo "  --num-repeats                      # Comma-separated list of positive integer"
+  echo "                                     # multiplicities, one for each input"
+  echo "                                     # alignment directory. The alignments"
+  echo "                                     # from each source are repeated the"
+  echo "                                     # corresponding number of times when"
+  echo "                                     # training the LM. If not specified,"
+  echo "                                     # a multiplicity of '1' is used for all sources."
+ exit 1;
+fi
+
+dir=${@: -1} # the working directory: last argument to the script
+ali_dirs=( $@ ) # read the remaining arguments into an array
+unset ali_dirs[${#ali_dirs[@]}-1] # 'pop' the last argument which is $dir
+num_alignments=${#ali_dirs[@]} # number of alignment dirs to combine
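+# e.g. for '$0 exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp', ali_dirs
+# becomes (exp/tri1_ali exp/tri2_ali) and dir becomes exp/chain/tdnn_1a_sp.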
+
+mkdir -p $dir/log
+for n in `seq 0 $[$num_alignments-1]`; do
+ ali_dir=${ali_dirs[$n]}
+ for f in $ali_dir/ali.1.gz $ali_dir/final.mdl $ali_dir/tree; do
+ [ ! -f $f ] && echo "$0: Expected file $f to exist" && exit 1;
+ done
+ utils/lang/check_phones_compatible.sh ${ali_dirs[0]}/phones.txt \
+ ${ali_dirs[$n]}/phones.txt || exit 1;
+done
+
+cp ${ali_dirs[0]}/tree $dir/ || exit 1
+
+if [ -z "$num_repeats" ]; then
+ # If 'num_repeats' is not specified, set num_repeats_array to e.g. (1 1 1).
+ num_repeats_array=( $(for n in $(seq $num_alignments); do echo 1; done) )
+else
+ num_repeats_array=(${num_repeats//,/ })
+  # Note: don't overwrite $num_repeats here; the original string is reused
+  # in error messages below.
+  if [ ${#num_repeats_array[@]} -ne $num_alignments ]; then
+    echo "$0: wrong number of elements in --num-repeats option: '$num_repeats'"
+ exit 1
+ fi
+fi
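+# e.g. --num-repeats '1,3' with two alignment dirs gives
+# num_repeats_array=(1 3).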
+
+if [ $stage -le 1 ]; then
+ all_phones="" # will contain the names of the .gz files containing phones,
+ # with some members possibly repeated per the --num-repeats
+ # option
+ for n in `seq 0 $[num_alignments-1]`; do
+ this_num_repeats=${num_repeats_array[$n]}
+ this_alignment_dir=${ali_dirs[$n]}
+ num_jobs=$(cat $this_alignment_dir/num_jobs)
+ if ! [ "$this_num_repeats" -gt 0 ]; then
+ echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'"
+ exit 1
+ fi
+
+ for j in $(seq $num_jobs); do gunzip -c $this_alignment_dir/ali.$j.gz; done | \
+ ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1;
+
+ all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)"
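+    # e.g. with --num-repeats '1,3', phones.1.gz is listed three times here,
+    # so the second source gets 3x the weight in the phone-LM estimation.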
+ done
+
+ $cmd $dir/log/make_phone_lm_fst.log \
+ gunzip -c $all_phones \| \
+ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1;
+ rm $dir/phones.*.gz
+fi
+
+if [ $stage -le 2 ]; then
+ copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl || exit 1;
+fi
+
+if [ $stage -le 3 ]; then
+ $cmd $dir/log/make_den_fst.log \
+ chain-make-den-fst $dir/tree $dir/0.trans_mdl \
+ $dir/phone_lm.fst \
+ $dir/den.fst $dir/normalization.fst || exit 1
+fi
+
+echo "Successfully created {den,normalization}.fst"
+
+exit 0
index 55c0c25dfd55b2f83e82e9d27144fdf6a044e4ac..d23c379e104e76ea27abba3f6317cb76338f646b 100755 (executable)
help="Deprecated. Kept for back compatibility")
# trainer options
+ parser.add_argument("--trainer.input-model", type=str,
+ dest='input_model', default=None,
+ action=common_lib.NullstrToNoneAction,
+                        help="If specified, this model is used as the "
+                             "initial 'raw' model (0.raw in the script) "
+                             "instead of initializing the model from the "
+                             "xconfig. Also, the configs dir is not expected "
+                             "to exist, and the left/right context is "
+                             "computed from this model.")
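+    # (Hypothetical example, for illustration: passing
+    #  --trainer.input-model exp/chain/tdnn_wsj_rm_1a/input.raw would use that
+    #  raw model as 0.raw instead of initializing from the configs dir.)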
parser.add_argument("--trainer.num-epochs", type=float, dest='num_epochs',
default=10.0,
help="Number of epochs to train the model")
"""
if not common_train_lib.validate_chunk_width(args.chunk_width):
- raise Exception("--egs.chunk-width has an invalid value");
+ raise Exception("--egs.chunk-width has an invalid value")
if not common_train_lib.validate_minibatch_size_str(args.num_chunk_per_minibatch):
- raise Exception("--trainer.num-chunk-per-minibatch has an invalid value");
+ raise Exception("--trainer.num-chunk-per-minibatch has an invalid value")
if args.chunk_left_context < 0:
raise Exception("--egs.chunk-left-context should be non-negative")
args.deriv_truncate_margin))
if (not os.path.exists(args.dir)
- or not os.path.exists(args.dir+"/configs")):
- raise Exception("This scripts expects {0} to exist and have a configs "
- "directory which is the output of "
- "make_configs.py script".format(
- args.dir))
+            or (not os.path.exists(args.dir+"/configs") and
+                (args.input_model is None or
+                 not os.path.exists(args.input_model)))):
+        raise Exception("This script expects {0} to exist. Also, either the "
+                        "--trainer.input-model option (used as 0.raw in the "
+                        "script) should be supplied, or the {0}/configs "
+                        "directory, which is the output of make_configs.py, "
+                        "should be present.".format(args.dir))
if args.transform_dir is None:
args.transform_dir = args.lat_dir
# split the training data into parts for individual jobs
# we will use the same number of jobs as that used for alignment
- common_lib.execute_command("utils/split_data.sh {0} {1}".format(
- args.feat_dir, num_jobs))
- shutil.copy('{0}/tree'.format(args.tree_dir), args.dir)
+ common_lib.execute_command("utils/split_data.sh {0} {1}"
+ "".format(args.feat_dir, num_jobs))
with open('{0}/num_jobs'.format(args.dir), 'w') as f:
f.write(str(num_jobs))
- config_dir = '{0}/configs'.format(args.dir)
- var_file = '{0}/vars'.format(config_dir)
+ if args.input_model is None:
+ config_dir = '{0}/configs'.format(args.dir)
+ var_file = '{0}/vars'.format(config_dir)
- variables = common_train_lib.parse_generic_config_vars_file(var_file)
+ variables = common_train_lib.parse_generic_config_vars_file(var_file)
+ else:
+ # If args.input_model is specified, the model left and right contexts
+ # are computed using input_model.
+ variables = common_train_lib.get_input_model_info(args.input_model)
# Set some variables.
try:
if (args.stage <= -5):
logger.info("Creating denominator FST")
+ shutil.copy('{0}/tree'.format(args.tree_dir), args.dir)
chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts)
- if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"):
+ if ((args.stage <= -4) and
+ os.path.exists("{0}/configs/init.config".format(args.dir))
+ and (args.input_model is None)):
logger.info("Initializing a basic network for estimating "
"preconditioning matrix")
common_lib.execute_command(
"""{command} {dir}/log/nnet_init.log \
- nnet3-init --srand=-2 {dir}/configs/init.config \
- {dir}/init.raw""".format(command=run_opts.command,
- dir=args.dir))
+ nnet3-init --srand=-2 {dir}/configs/init.config \
+ {dir}/init.raw""".format(command=run_opts.command,
+ dir=args.dir))
egs_left_context = left_context + args.frame_subsampling_factor / 2
egs_right_context = right_context + args.frame_subsampling_factor / 2
# note: the '+ args.frame_subsampling_factor / 2' is to allow for the
# fact that we'll be shifting the data slightly during training to give
# variety to the training data.
- egs_left_context_initial = (left_context_initial + args.frame_subsampling_factor / 2 if
+ egs_left_context_initial = (left_context_initial +
+ args.frame_subsampling_factor / 2 if
left_context_initial >= 0 else -1)
- egs_right_context_final = (right_context_final + args.frame_subsampling_factor / 2 if
+ egs_right_context_final = (right_context_final +
+ args.frame_subsampling_factor / 2 if
right_context_final >= 0 else -1)
default_egs_dir = '{0}/egs'.format(args.dir)
- if (args.stage <= -3) and args.egs_dir is None:
+ if ((args.stage <= -3) and args.egs_dir is None):
logger.info("Generating egs")
+ if (not os.path.exists("{0}/den.fst".format(args.dir)) or
+ not os.path.exists("{0}/normalization.fst".format(args.dir)) or
+ not os.path.exists("{0}/tree".format(args.dir))):
+ raise Exception("Chain egs generation expects {0}/den.fst, "
+ "{0}/normalization.fst and {0}/tree "
+ "to exist.".format(args.dir))
# this is where get_egs.sh is called.
chain_lib.generate_chain_egs(
dir=args.dir, data=args.feat_dir,
[egs_left_context, egs_right_context,
frames_per_eg_str, num_archives] = (
- common_train_lib.verify_egs_dir(egs_dir, feat_dim,
- ivector_dim, ivector_id,
- egs_left_context, egs_right_context,
- egs_left_context_initial,
- egs_right_context_final))
+ common_train_lib.verify_egs_dir(egs_dir, feat_dim,
+ ivector_dim, ivector_id,
+ egs_left_context, egs_right_context,
+ egs_left_context_initial,
+ egs_right_context_final))
assert(args.chunk_width == frames_per_eg_str)
num_archives_expanded = num_archives * args.frame_subsampling_factor
logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir))
common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir)
- if (args.stage <= -2) and os.path.exists(args.dir+"/configs/init.config"):
+ if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config"))
+ and (args.input_model is None)):
logger.info('Computing the preconditioning matrix for input features')
chain_lib.compute_preconditioning_matrix(
if (args.stage <= -1):
logger.info("Preparing the initial acoustic model.")
- chain_lib.prepare_initial_acoustic_model(args.dir, run_opts)
+ chain_lib.prepare_initial_acoustic_model(args.dir, run_opts,
+ input_model=args.input_model)
with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f:
f.write(str(args.frame_subsampling_factor))
if args.shrink_value < shrinkage_value:
shrinkage_value = (args.shrink_value
if common_train_lib.should_do_shrinkage(
- iter, model_file,
- args.shrink_saturation_threshold)
+ iter, model_file,
+ args.shrink_saturation_threshold)
else shrinkage_value)
percent = num_archives_processed * 100.0 / num_archives_to_process
logger.info("Copying the last-numbered model to final.mdl")
common_lib.force_symlink("{0}.mdl".format(num_iters),
"{0}/final.mdl".format(args.dir))
- common_lib.force_symlink("compute_prob_valid.{iter}.log".format(
- iter=num_iters-1),
+ common_lib.force_symlink("compute_prob_valid.{iter}.log"
+ "".format(iter=num_iters-1),
"{dir}/log/compute_prob_valid.final.log".format(
dir=args.dir))
diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
index d582db77a72c162d699e4e30c6c3a1f96bf0f38d..d74135e5980e9575e05ebfa87b01d8eb096c75df 100755 (executable)
epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples')
parser.add_argument('--xconfig-file', required=True,
help='Filename of input xconfig file')
+    parser.add_argument('--existing-model',
+                        help='Filename of a previously trained neural net '
+                             '(e.g. final.mdl), which is useful when the new '
+                             'config refers to nodes from the list of '
+                             'component-nodes of the already-trained model. '
+                             'The context info is also generated using a '
+                             'model obtained by adding final.config to the '
+                             'existing model. E.g. in transfer learning, the '
+                             'new model is generated using component nodes '
+                             'of the existing model.')
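+    # (Hypothetical example, for illustration: in transfer learning one might
+    #  pass --existing-model exp/chain/tdnn_wsj/final.mdl so that component
+    #  nodes of that model can be referenced in the new xconfig file.)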
parser.add_argument('--config-dir', required=True,
help='Directory to write config files and variables')
parser.add_argument('--nnet-edits', type=str, default=None,
try:
xconfig_file_in = open(xconfig_file)
except:
- raise Exception('{0}: error opening file {1} for input'.format(
- sys.argv[0], config_dir))
+        raise Exception('{0}: error opening file {1} for input'
+                        ''.format(sys.argv[0], xconfig_file))
print("# This file was created by the command:\n"
"# {0}\n"
if basename == 'init':
continue # do not write the init.config
else:
- print('{0}: error in xconfig file {1}: may be lack of a output layer'.format(
- sys.argv[0], sys.argv[2]), file=sys.stderr)
+            print('{0}: error in xconfig file {1}: possibly a missing '
+                  'output layer'.format(sys.argv[0], sys.argv[2]),
+                  file=sys.stderr)
raise
header = config_basename_to_header[basename]
print(line, file=f)
f.close()
except Exception as e:
- print('{0}: error writing to config file {1}: error is {2}'.format(
- sys.argv[0], filename, repr(e)), file=sys.stderr)
+ print('{0}: error writing to config file {1}: error is {2}'
+ ''.format(sys.argv[0], filename, repr(e)), file=sys.stderr)
# we use raise rather than raise(e) as using a blank raise
# preserves the backtrace
raise
-def add_nnet_context_info(config_dir, nnet_edits=None):
+def add_nnet_context_info(config_dir, nnet_edits=None,
+ existing_model=None):
"""Create the 'vars' file that specifies model_left_context, etc."""
- common_lib.execute_command("nnet3-init {0}/ref.config "
- "{0}/ref.raw".format(config_dir))
+ common_lib.execute_command("nnet3-init {0} {1}/ref.config "
+ "{1}/ref.raw"
+                               "".format(existing_model
+                                         if existing_model is not None
+                                         else "",
+                                         config_dir))
model = "{0}/ref.raw".format(config_dir)
if nnet_edits is not None:
model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
vf.write('model_right_context={0}\n'.format(info['right-context']))
vf.close()
-def check_model_contexts(config_dir, nnet_edits=None):
+def check_model_contexts(config_dir, nnet_edits=None, existing_model=None):
contexts = {}
for file_name in ['init', 'ref']:
if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)):
contexts[file_name] = {}
- common_lib.execute_command("nnet3-init {0}/{1}.config "
- "{0}/{1}.raw".format(config_dir, file_name))
+ common_lib.execute_command("nnet3-init {0} {1}/{2}.config "
+ "{1}/{2}.raw"
+                                   "".format(existing_model
+                                             if existing_model is not None
+                                             else '',
+ config_dir, file_name))
model = "{0}/{1}.raw".format(config_dir, file_name)
if nnet_edits is not None:
model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
def main():
args = get_args()
backup_xconfig_file(args.xconfig_file, args.config_dir)
- all_layers = xparser.read_xconfig_file(args.xconfig_file)
+ existing_layers = []
+ if args.existing_model is not None:
+ existing_layers = xparser.get_model_component_info(args.existing_model)
+ all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers)
write_expanded_xconfig_files(args.config_dir, all_layers)
write_config_files(args.config_dir, all_layers)
- check_model_contexts(args.config_dir, args.nnet_edits)
- add_nnet_context_info(args.config_dir, args.nnet_edits)
+ check_model_contexts(args.config_dir, args.nnet_edits,
+ existing_model=args.existing_model)
+ add_nnet_context_info(args.config_dir, args.nnet_edits,
+ existing_model=args.existing_model)
if __name__ == '__main__':
index f94228a1c6a5ada7f384cce26332da7b6d60ca37..4cbdcc937c40b4eeedc579a223184dacf1ab98dd 100644 (file)
/// learning_rate_factor_.
virtual void SetAsGradient() { learning_rate_ = 1.0; is_gradient_ = true; }
+  /// Sets the learning-rate factor (learning_rate_factor_) to lrate_factor.
+ virtual void SetLearningRateFactor(BaseFloat lrate_factor) {
+ learning_rate_factor_ = lrate_factor;
+ }
+
/// freezes/unfreezes NaturalGradient updates, if applicable (to be overriden
/// by components that use Natural Gradient).
virtual void FreezeNaturalGradient(bool freeze) { }
index 4b230b7fdb967a9e5f5d84289fdda39093e85f3d..6df13cd645efab8e541f717d09005c1e0bfa0751 100644 (file)
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
num_learning_rates_set++;
}
}
- KALDI_LOG << "Set learning rates for " << num_learning_rates_set << " nodes.";
+ KALDI_LOG << "Set learning rates for " << num_learning_rates_set << " components.";
+ } else if (directive == "set-learning-rate-factor") {
+ std::string name_pattern = "*";
+ // name_pattern defaults to '*' if none is given.
+ config_line.GetValue("name", &name_pattern);
+ BaseFloat learning_rate_factor = -1;
+ if (!config_line.GetValue("learning-rate-factor", &learning_rate_factor)) {
+ KALDI_ERR << "In edits-config, expected learning-rate-factor to be set in line: "
+ << config_line.WholeLine();
+ }
+      // Note: this sets the component's learning_rate_factor_ to the value
+      // you provide, so when SetUnderlyingLearningRate() is later called,
+      // the actual learning rate (learning_rate_) becomes the provided
+      // factor times the underlying learning rate.
+ UpdatableComponent *component = NULL;
+ int32 num_learning_rate_factors_set = 0;
+ for (int32 c = 0; c < nnet->NumComponents(); c++) {
+ if (NameMatchesPattern(nnet->GetComponentName(c).c_str(),
+ name_pattern.c_str()) &&
+ (component =
+ dynamic_cast<UpdatableComponent*>(nnet->GetComponent(c)))) {
+ component->SetLearningRateFactor(learning_rate_factor);
+ num_learning_rate_factors_set++;
+ }
+ }
+ KALDI_LOG << "Set learning rate factors for " << num_learning_rate_factors_set
+ << " components.";
} else if (directive == "rename-node") {
// this is a shallow renaming of a node, and it requires that the name used is
// not the name of another node.
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 54428deeb3ddb407cb539d068d7e4dba51b01d29..c8c371b2da8d5fca9bece026eb8fc62172e98308 100644 (file)
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
The same as calling remove-orphan-nodes and then remove-orphan-components.
set-learning-rate [name=<name-pattern>] learning-rate=<learning-rate>
- Sets the learning rate for any updatable nodes matching the name pattern.
+ Sets the learning rate for any updatable components matching the name pattern.
Note: this sets the 'underlying' learning rate, i.e. it will get
- multiplied by any 'learning-rate-factor' set in the nodes.
+ multiplied by any 'learning-rate-factor' set in the components.
+
+ set-learning-rate-factor [name=<name-pattern>] learning-rate-factor=<learning-rate-factor>
+ Sets the learning rate factor for any updatable components matching the name pattern.
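+      A hypothetical edits-config line, for illustration:
+        set-learning-rate-factor name=tdnn* learning-rate-factor=0.25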
rename-node old-name=<old-name> new-name=<new-name>
Renames a node; this is a surface renaming that does not affect the structure