#!/bin/bash

# In this recipe everything is the same as tdnn_7k, except the
# 7th TDNN layer has been replaced with an attention layer.

# local/chain/compare_wer_general.sh exp/chain/tdnn_7k_sp exp/chain/tdnn_attend_1a_sp
# System                    tdnn_7k_sp  tdnn_attend_1a_sp
# WER on train_dev(tg)      13.93       13.76
# WER on train_dev(fg)      12.85       12.62
# WER on eval2000(tg)       16.7        16.2
# WER on eval2000(fg)       15.0        14.5
# Final train prob          -0.085      -0.076
# Final valid prob          -0.106      -0.098
# Final train prob (xent)   -1.260      -0.997
# Final valid prob (xent)   -1.3193     -1.0887

# steps/info/chain_dir_info.pl exp/chain/tdnn_attend_1a_sp
# exp/chain/tdnn_attend_1a_sp/: num-iters=262 nj=3..16 num-params=16.8M dim=40+100->6076 combine=-0.095->-0.095 xent:train/valid[173,261,final]=(-1.06,-0.993,-0.997/-1.14,-1.09,-1.09) logprob:train/valid[173,261,final]=(-0.084,-0.076,-0.076/-0.104,-0.099,-0.098)

# steps/info/chain_dir_info.pl exp/chain/tdnn_7k_sp
# exp/chain/tdnn_7k_sp: num-iters=262 nj=3..16 num-params=15.6M dim=40+100->6076 combine=-0.106->-0.106 xent:train/valid[173,261,final]=(-1.32,-1.25,-1.26/-1.36,-1.31,-1.32) logprob:train/valid[173,261,final]=(-0.093,-0.085,-0.085/-0.110,-0.106,-0.106)

set -e

# configs for 'chain'
affix=1a
stage=12
train_stage=-10
get_egs_stage=-10
speed_perturb=true
dir=exp/chain/tdnn_attend  # Note: _sp will get added to this if $speed_perturb == true.
decode_iter=
decode_nj=50

# training options
num_epochs=4
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=3
num_jobs_final=16
minibatch_size=128
frames_per_eg=150
remove_egs=false
common_egs_dir=
xent_regularize=0.1

test_online_decoding=false  # if true, it will run the last decoding stage.
has_fisher=true  # set to false if the Fisher-interpolated LM dir data/lang_sw1_fsh_fg
                 # is not available; it is only needed for the rescoring steps below.

# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
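# For example, on a rerun where those stages are already done, you could invoke
# this script with something like:
#   $0 --stage 8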

suffix=
if [ "$speed_perturb" == "true" ]; then
  suffix=_sp
fi

dir=${dir}${affix:+_$affix}$suffix
train_set=train_nodup$suffix
ali_dir=exp/tri4_ali_nodup$suffix
treedir=exp/chain/tri5_7d_tree$suffix
lang=data/lang_chain_2y

# if we are using the speed-perturbed data we need to generate
# alignments for it.
local/nnet3/run_ivector_common.sh --stage $stage \
  --speed-perturb $speed_perturb \
  --generate-alignments $speed_perturb || exit 1;

if [ $stage -le 9 ]; then
  # Get the alignments as lattices (gives the LF-MMI training more freedom).
  # use the same num-jobs as the alignments
  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
  rm exp/tri4_lats_nodup$suffix/fsts.*.gz  # save space
fi

if [ $stage -le 10 ]; then
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file.  [Note: it really has two states; the first one is only repeated
  # once, the second one has zero or more repeats.]
  rm -rf $lang
  cp -r data/lang $lang
  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
  # Use our special topology... note that we may have to tune this topology
  # later on.
  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi

if [ $stage -le 11 ]; then
  # Build a tree using our new topology.  This is the critically different
  # step compared with other recipes.
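  # We ask for up to 7000 leaves (the '7000' argument below), and the
  # --frame-subsampling-factor of 3 matches the reduced output frame rate
  # used by 'chain' models.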
  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
    --leftmost-questions-truncate $leftmost_questions_truncate \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir
fi

if [ $stage -le 12 ]; then
  echo "$0: creating neural net configs using the xconfig parser";
  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)
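  # (with xent_regularize=0.1 this works out to a learning-rate factor of 5.0;
  # see the comment above the xent output layer in the xconfig below.)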

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=40 name=input

  # please note that it is important to have an input layer named 'input' as
  # the layer immediately preceding the fixed-affine-layer, to enable the use
  # of short notation for the descriptor
  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-layer name=tdnn1 dim=625
  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625
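  # The attention layer below replaces what was tdnn7 in the tdnn_7k recipe.
  # With num-left-inputs=5, num-right-inputs=2 and time-stride=3, each of the
  # 15 heads attends over 8 positions, spanning 15 frames of left context and
  # 6 frames of right context at this layer's input.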
  attention-relu-renorm-layer name=attention1 num-heads=15 value-dim=80 key-dim=40 num-left-inputs=5 num-right-inputs=2 time-stride=3

  ## adding the layers for chain branch
  relu-batchnorm-layer name=prefinal-chain input=attention1 dim=625 target-rms=0.5
  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5

  # adding the layers for xent branch
  # This block prints the configs for a separate output that will be
  # trained with a cross-entropy objective in the 'chain' models... this
  # has the effect of regularizing the hidden parts of the model.  We use
  # 0.5 / xent_regularize as the learning-rate factor; this factor means the
  # xent final layer learns at a rate independent of the regularization
  # constant, and the 0.5 was tuned so as to make the relative progress
  # similar in the xent and regular final layers.
  relu-batchnorm-layer name=prefinal-xent input=attention1 dim=625 target-rms=0.5
  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5

EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi

if [ $stage -le 13 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
  fi
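
  # --chain.xent-regularize, --chain.leaky-hmm-coefficient and
  # --chain.l2-regularize below are the values commonly used in the other
  # swbd 'chain' recipes; the xent output configured above acts as a
  # regularizer on the shared hidden layers during training.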
  steps/nnet3/chain/train.py --stage $train_stage \
    --cmd "$decode_cmd" \
    --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient 0.1 \
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --egs.dir "$common_egs_dir" \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $frames_per_eg \
    --trainer.num-chunk-per-minibatch $minibatch_size \
    --trainer.frames-per-iter 1500000 \
    --trainer.num-epochs $num_epochs \
    --trainer.optimization.num-jobs-initial $num_jobs_initial \
    --trainer.optimization.num-jobs-final $num_jobs_final \
    --trainer.optimization.initial-effective-lrate $initial_effective_lrate \
    --trainer.optimization.final-effective-lrate $final_effective_lrate \
    --trainer.max-param-change $max_param_change \
    --cleanup.remove-egs $remove_egs \
    --feat-dir data/${train_set}_hires \
    --tree-dir $treedir \
    --lat-dir exp/tri4_lats_nodup$suffix \
    --dir $dir || exit 1;

fi

if [ $stage -le 14 ]; then
  # Note: it might appear that this $lang directory is mismatched, and it is as
  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
  # the lang directory.
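  # The --self-loop-scale of 1.0 matches what 'chain' models expect (they are
  # also decoded with an acoustic scale of 1.0 below).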
  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
fi

graph_dir=$dir/graph_sw1_tg
iter_opts=
if [ ! -z $decode_iter ]; then
  iter_opts=" --iter $decode_iter "
fi
if [ $stage -le 15 ]; then
  rm $dir/.error 2>/dev/null || true
  for decode_set in train_dev eval2000; do
    (
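      # 'chain' models are decoded with an acoustic scale of 1.0; the
      # --post-decode-acwt 10.0 option scales the acoustic costs written to
      # the lattice so the usual LM-weight range still applies at scoring time.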
      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
        --nj $decode_nj --cmd "$decode_cmd" $iter_opts \
        --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
        $graph_dir data/${decode_set}_hires \
        $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1;
      if $has_fisher; then
        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
          $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
      fi
    ) || touch $dir/.error &
  done
  wait
  if [ -f $dir/.error ]; then
    echo "$0: something went wrong in decoding"
    exit 1
  fi
fi

if $test_online_decoding && [ $stage -le 16 ]; then
  # note: if the features change (e.g. you add pitch features), you will have to
  # change the options of the following command line.
  steps/online/nnet3/prepare_online_decoding.sh \
    --mfcc-config conf/mfcc_hires.conf \
    $lang exp/nnet3/extractor $dir ${dir}_online

  rm $dir/.error 2>/dev/null || true
  for decode_set in train_dev eval2000; do
    (
      # note: we just give it "$decode_set" as it only uses the wav.scp, the
      # feature type does not matter.

      steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \
        --acwt 1.0 --post-decode-acwt 10.0 \
        $graph_dir data/${decode_set}_hires \
        ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1;
      if $has_fisher; then
        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
          ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1;
      fi
    ) || touch $dir/.error &
  done
  wait
  if [ -f $dir/.error ]; then
    echo "$0: something went wrong in decoding"
    exit 1
  fi
fi

exit 0;