#!/bin/bash

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.

# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.

#wsj0=/ais/gobi2/speech/WSJ/csr_?_senn_d?
#wsj1=/ais/gobi2/speech/WSJ/csr_senn_d?

#wsj0=/mnt/matylda2/data/WSJ0
#wsj1=/mnt/matylda2/data/WSJ1

#wsj0=/data/corpora0/LDC93S6B
#wsj1=/data/corpora0/LDC94S13B

wsj0=/export/corpora5/LDC/LDC93S6B
wsj1=/export/corpora5/LDC/LDC94S13B

local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1;
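# Note: the globs above are meant to match the per-disc subdirectories of the
# LDC distribution, which have names like 11-13.1 (this is an illustrative,
# hypothetical listing; check what your copy actually contains):
#   ls $wsj0   ->   11-1.1  11-2.1  ...  11-13.1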

# Sometimes, we have seen WSJ distributions that do not have subdirectories
# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the
# wsj0 or wsj1 directories. In such cases, try the following:
#
# corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj
# local/cstr_wsj_data_prep.sh $corpus
# rm data/local/dict/lexiconp.txt
# $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work.

local/wsj_prepare_dict.sh || exit 1;

utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;
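# (The arguments to utils/prepare_lang.sh are: <dict-src-dir> <oov-dict-entry>
# <tmp-dir> <lang-dir>; "<SPOKEN_NOISE>" is the word that out-of-vocabulary
# words are mapped to during training.)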

local/wsj_format_data.sh || exit 1;

# We suggest running the next three commands in the background,
# as they are not a precondition for the system building and
# most of the tests: these commands build a dictionary
# containing many of the OOVs in the WSJ LM training data,
# and an LM trained directly on that data (i.e. not just
# copying the ARPA files from the LDC disks).
# Caution: the commands below will only work if $decode_cmd
# is set up to use qsub. Otherwise, just remove the --cmd option.
# NOTE: If you have a setup corresponding to the cstr_wsj_data_prep.sh style,
# use local/cstr_wsj_extend_dict.sh $corpus/wsj1/doc/ instead.

# Note: I am commenting out the RNNLM-building commands below. They take up a lot
# of CPU time and are not really part of the "main recipe."
# Be careful: appending things like "-l mem_free=10G" to $decode_cmd
# won't always work; it depends on what $decode_cmd is.
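# For reference, typical ways $decode_cmd might be defined in cmd.sh
# (illustrative only; the right queue options depend on your cluster):
#   export decode_cmd="run.pl"                # run locally, no queue
#   export decode_cmd="queue.pl -l arch=*64"  # submit jobs via SGE's qsub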
(
 local/wsj_extend_dict.sh $wsj1/13-32.1 && \
 utils/prepare_lang.sh data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger data/lang_bd && \
 local/wsj_train_lms.sh &&
 local/wsj_format_local_lms.sh # &&
 #
 # ( local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=10G" data/local/rnnlm.h30.voc10k &
 #   sleep 20; # wait till tools are compiled.
 #   local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=12G" \
 #     --hidden 100 --nwords 20000 --class 350 --direct 1500 data/local/rnnlm.h100.voc20k &
 #   local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=14G" \
 #     --hidden 200 --nwords 30000 --class 350 --direct 1500 data/local/rnnlm.h200.voc30k &
 #   local/wsj_train_rnnlms.sh --cmd "$decode_cmd -l mem_free=16G" \
 #     --hidden 300 --nwords 40000 --class 400 --direct 2000 data/local/rnnlm.h300.voc40k &
 # )

 # Comment out the 'false &&' on the next line to actually train the RNNLM-HS models.
 false && \
 (
  num_threads_rnnlm=8
  local/wsj_train_rnnlms.sh --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \
    --cmd "$decode_cmd -l mem_free=1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \
    --hidden 30 --nwords 10000 --direct 1000 data/local/rnnlm-hs.h30.voc10k
  local/wsj_train_rnnlms.sh --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \
    --cmd "$decode_cmd -l mem_free=1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \
    --hidden 100 --nwords 20000 --direct 1500 data/local/rnnlm-hs.h100.voc20k
  local/wsj_train_rnnlms.sh --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \
    --cmd "$decode_cmd -l mem_free=1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \
    --hidden 300 --nwords 30000 --direct 1500 data/local/rnnlm-hs.h300.voc30k
  local/wsj_train_rnnlms.sh --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \
    --cmd "$decode_cmd -l mem_free=1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \
    --hidden 400 --nwords 40000 --direct 2000 data/local/rnnlm-hs.h400.voc40k
 )
) &

# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
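# e.g. (hypothetical path): mfccdir=/mnt/bigdisk/$USER/wsj_mfcc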
mfccdir=mfcc
for x in test_eval92 test_eval93 test_dev93 train_si284; do
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \
    data/$x exp/make_mfcc/$x $mfccdir || exit 1;
  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
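# Optionally, sanity-check the resulting data directories, e.g.:
#   utils/validate_data_dir.sh data/train_si284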

# Make the si-84 training subset, which consists of the first 7138
# utterances of si-284.
utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1

# Now make a subset with the shortest 2k utterances from si-84.
utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1;
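# (The shortest utterances are chosen because they are easier to align when
# flat-starting the monophone system below.)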

# Now make a subset with half of the data from si-84.
utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1;

# Note: the --boost-silence option should probably be omitted by default
# for normal setups. It doesn't always help. [It's to discourage non-silence
# models from modeling silence.]
steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
  data/train_si84_2kshort data/lang exp/mono0a || exit 1;

(
 utils/mkgraph.sh --mono data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr && \
 steps/decode.sh --nj 10 --cmd "$decode_cmd" \
   exp/mono0a/graph_tgpr data/test_dev93 exp/mono0a/decode_tgpr_dev93 && \
 steps/decode.sh --nj 8 --cmd "$decode_cmd" \
   exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92
) &

steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
  data/train_si84_half data/lang exp/mono0a exp/mono0a_ali || exit 1;

# The numeric arguments to train_deltas.sh are the number of leaves (2000)
# and the total number of Gaussians (10000).
steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
  2000 10000 data/train_si84_half data/lang exp/mono0a_ali exp/tri1 || exit 1;

# The monophone mkgraph.sh job above (running in the background) may still be
# writing data/lang_test_tgpr/tmp/LG.fst; if the mkgraph.sh below reads a
# partially written LG.fst it will fail, so wait until the file exists and is
# non-empty, plus a little extra for safety.
while [ ! -s data/lang_test_tgpr/tmp/LG.fst ]; do
  sleep 20;
done
sleep 30;

utils/mkgraph.sh data/lang_test_tgpr exp/tri1 exp/tri1/graph_tgpr || exit 1;

steps/decode.sh --nj 10 --cmd "$decode_cmd" \
  exp/tri1/graph_tgpr data/test_dev93 exp/tri1/decode_tgpr_dev93 || exit 1;
steps/decode.sh --nj 8 --cmd "$decode_cmd" \
  exp/tri1/graph_tgpr data/test_eval92 exp/tri1/decode_tgpr_eval92 || exit 1;

# Test various modes of LM rescoring (mode 4 is the default).
# This is just to confirm that they are equivalent.
for mode in 1 2 3 4; do
  steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
    data/test_dev93 exp/tri1/decode_tgpr_dev93 exp/tri1/decode_tgpr_dev93_tg$mode || exit 1;
done
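# You can compare the rescored results with the same WER-grepping idiom used
# in the "Getting results" section at the bottom of this script, e.g.:
#   for d in exp/tri1/decode_tgpr_dev93_tg?; do grep WER $d/wer_* | utils/best_wer.sh; done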

# Demonstrate how to get lattices that are "word-aligned" (arcs coincide with
# words, with boundaries in the right place).
sil_label=$(grep '!SIL' data/lang_test_tgpr/words.txt | awk '{print $2}')  # the integer label of !SIL
steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \
  data/lang_test_tgpr exp/tri1/decode_tgpr_dev93 exp/tri1/decode_tgpr_dev93_aligned || exit 1;
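# To eyeball the word-aligned lattices in text form, something like the
# following should work (lat.1.gz being one of the archives written above):
#   lattice-copy "ark:gunzip -c exp/tri1/decode_tgpr_dev93_aligned/lat.1.gz|" ark,t:- | head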

steps/align_si.sh --nj 10 --cmd "$train_cmd" \
  data/train_si84 data/lang exp/tri1 exp/tri1_ali_si84 || exit 1;

# Train tri2a, which is deltas + delta-deltas, on si84 data.
steps/train_deltas.sh --cmd "$train_cmd" \
  2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2a || exit 1;

utils/mkgraph.sh data/lang_test_tgpr exp/tri2a exp/tri2a/graph_tgpr || exit 1;

steps/decode.sh --nj 10 --cmd "$decode_cmd" \
  exp/tri2a/graph_tgpr data/test_dev93 exp/tri2a/decode_tgpr_dev93 || exit 1;
steps/decode.sh --nj 8 --cmd "$decode_cmd" \
  exp/tri2a/graph_tgpr data/test_eval92 exp/tri2a/decode_tgpr_eval92 || exit 1;

# Also decode eval92 with the 5k-vocabulary bigram LM.
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k || exit 1;
steps/decode.sh --nj 8 --cmd "$decode_cmd" \
  exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k || exit 1;

# Train tri2b, which is LDA+MLLT, on si84 data.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
  --splice-opts "--left-context=3 --right-context=3" \
  2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b || exit 1;
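# (The --splice-opts above mean that 3 frames of context on each side of the
# current frame, i.e. 7 frames in total, are spliced together before the LDA.)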

utils/mkgraph.sh data/lang_test_tgpr exp/tri2b exp/tri2b/graph_tgpr || exit 1;
steps/decode.sh --nj 10 --cmd "$decode_cmd" \
  exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b/decode_tgpr_dev93 || exit 1;
steps/decode.sh --nj 8 --cmd "$decode_cmd" \
  exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b/decode_tgpr_eval92 || exit 1;

# At this point, you could run the example scripts that show how VTLN works.
# We haven't included this in the default recipes yet.
# local/run_vtln.sh
# local/run_vtln2.sh

# Now, on dev93, compare lattice rescoring with biglm decoding, going from the
# pruned trigram (tgpr) to the full trigram (tg). Note: the results are not
# the same, even though in principle they should be, and I believe this is
# because the beams are not wide enough. The pruning seems to be a bit too
# narrow in the current scripts (we got at least 0.7% absolute improvement
# from widening the beams from their current values).

steps/decode_biglm.sh --nj 10 --cmd "$decode_cmd" \
  exp/tri2b/graph_tgpr data/lang_test_{tgpr,tg}/G.fst \
  data/test_dev93 exp/tri2b/decode_tgpr_dev93_tg_biglm

# Baseline via LM rescoring of lattices.
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr/ data/lang_test_tg/ \
  data/test_dev93 exp/tri2b/decode_tgpr_dev93 exp/tri2b/decode_tgpr_dev93_tg || exit 1;

# Try Minimum Bayes Risk decoding (similar to Confusion Network decoding):
mkdir -p exp/tri2b/decode_tgpr_dev93_tg_mbr
cp exp/tri2b/decode_tgpr_dev93_tg/lat.*.gz exp/tri2b/decode_tgpr_dev93_tg_mbr
local/score_mbr.sh --cmd "$decode_cmd" \
  data/test_dev93/ data/lang_test_tgpr/ exp/tri2b/decode_tgpr_dev93_tg_mbr

# Demonstrate steps/decode_fromlats.sh: decode with the tri2a model, using
# graphs built from the tri2b lattices.
steps/decode_fromlats.sh --cmd "$decode_cmd" \
  data/test_dev93 data/lang_test_tgpr exp/tri2b/decode_tgpr_dev93 \
  exp/tri2a/decode_tgpr_dev93_fromlats || exit 1

# Align tri2b system with si84 data.
steps/align_si.sh --nj 10 --cmd "$train_cmd" \
  --use-graphs true data/train_si84 data/lang exp/tri2b exp/tri2b_ali_si84 || exit 1;

# Train and test MMI (and boosted MMI) on tri2b.
local/run_mmi_tri2b.sh

# From the 2b system, train 3b, which is LDA + MLLT + SAT.
steps/train_sat.sh --cmd "$train_cmd" \
  2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b || exit 1;
utils/mkgraph.sh data/lang_test_tgpr exp/tri3b exp/tri3b/graph_tgpr || exit 1;
steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
  exp/tri3b/graph_tgpr data/test_dev93 exp/tri3b/decode_tgpr_dev93 || exit 1;
steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
  exp/tri3b/graph_tgpr data/test_eval92 exp/tri3b/decode_tgpr_eval92 || exit 1;

# At this point you could run the command below; it gets results that
# demonstrate basis-fMLLR adaptation (adaptation on small amounts of
# adaptation data).
local/run_basis_fmllr.sh

steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr data/lang_test_tg \
  data/test_dev93 exp/tri3b/decode_tgpr_dev93 exp/tri3b/decode_tgpr_dev93_tg || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr data/lang_test_tg \
  data/test_eval92 exp/tri3b/decode_tgpr_eval92 exp/tri3b/decode_tgpr_eval92_tg || exit 1;

# Try the larger dictionary ("big-dict"/bd) + locally produced LM.
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri3b exp/tri3b/graph_bd_tgpr || exit 1;

steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 8 \
  exp/tri3b/graph_bd_tgpr data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 || exit 1;
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 \
  exp/tri3b/graph_bd_tgpr data/test_dev93 exp/tri3b/decode_bd_tgpr_dev93 || exit 1;

# Example of rescoring with ConstArpaLm.
steps/lmrescore_const_arpa.sh \
  --cmd "$decode_cmd" data/lang_test_bd_{tgpr,fgconst} \
  data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92{,_fgconst} || exit 1;
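# (The brace expansions above supply the source and destination directories:
# data/lang_test_bd_{tgpr,fgconst} expands to data/lang_test_bd_tgpr and
# data/lang_test_bd_fgconst, where "fgconst" is the 4-gram ("fg") LM in
# ConstArpaLm format.)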

steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \
  data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 exp/tri3b/decode_bd_tgpr_eval92_fg \
  || exit 1;
steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_tg \
  data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 exp/tri3b/decode_bd_tgpr_eval92_tg \
  || exit 1;

# The commands below are commented out because we commented out the
# RNNLM-building steps above, so they would fail.
# local/run_rnnlms_tri3b.sh
# wait; local/run_rnnlm-hs_tri3b.sh  # (the HS version)

# The following two commands, which are a kind of side branch, demonstrate the
# mixup script, increasing the number of Gaussians of the 3b system.
(
 steps/mixup.sh --cmd "$train_cmd" \
   20000 data/train_si84 data/lang exp/tri3b exp/tri3b_20k || exit 1;
 steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 \
   exp/tri3b/graph_tgpr data/test_dev93 exp/tri3b_20k/decode_tgpr_dev93 || exit 1;
)

# From the 3b system, align all the si284 data.
steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
  data/train_si284 data/lang exp/tri3b exp/tri3b_ali_si284 || exit 1;

# From the 3b system, train another SAT system (tri4a) with all the si284 data.
steps/train_sat.sh --cmd "$train_cmd" \
  4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4a || exit 1;
(
 utils/mkgraph.sh data/lang_test_tgpr exp/tri4a exp/tri4a/graph_tgpr || exit 1;
 steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
   exp/tri4a/graph_tgpr data/test_dev93 exp/tri4a/decode_tgpr_dev93 || exit 1;
 steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
   exp/tri4a/graph_tgpr data/test_eval92 exp/tri4a/decode_tgpr_eval92 || exit 1;
) &

# This step is just to demonstrate the train_quick.sh script, in which we
# initialize the GMMs from the old system's GMMs.
steps/train_quick.sh --cmd "$train_cmd" \
  4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4b || exit 1;

(
 utils/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr || exit 1;
 steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
   exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93 || exit 1;
 steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
   exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92 || exit 1;

 utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1;
 steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
   exp/tri4b/graph_bd_tgpr data/test_dev93 exp/tri4b/decode_bd_tgpr_dev93 || exit 1;
 steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
   exp/tri4b/graph_bd_tgpr data/test_eval92 exp/tri4b/decode_bd_tgpr_eval92 || exit 1;
) &

( # Run decoding with the larger dictionary and pronunciation probabilities. We
  # need to get a dict with pron-probs first. [This seems to help by about
  # 0.1% absolute, in general.]
 cp -rT data/local/dict_larger data/local/dict_larger_pp
 rm -r data/local/dict_larger_pp/{b,f,*.gz,lexicon.txt}
 steps/get_lexicon_probs.sh data/train_si284 data/lang exp/tri4b data/local/dict_larger/lexicon.txt \
   exp/tri4b_lexprobs data/local/dict_larger_pp/lexiconp.txt || exit 1;
 utils/prepare_lang.sh --share-silence-phones true \
   data/local/dict_larger_pp "<SPOKEN_NOISE>" data/dict_larger/tmp data/lang_bd_pp
 # Check that the word list is unchanged, so we can re-use the existing G.fst's.
 cmp data/lang_bd/words.txt data/lang_bd_pp/words.txt || exit 1;
 for suffix in tg tgpr fg; do
   cp -rT data/lang_bd_pp data/lang_test_bd_pp_${suffix}
   cp data/lang_test_bd_${suffix}/G.fst data/lang_test_bd_pp_${suffix}/G.fst || exit 1;
 done
 utils/mkgraph.sh data/lang_test_bd_pp_tgpr exp/tri4b exp/tri4b/graph_bd_pp_tgpr || exit 1;
 steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
   exp/tri4b/graph_bd_pp_tgpr data/test_dev93 exp/tri4b/decode_bd_pp_tgpr_dev93
 steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
   exp/tri4b/graph_bd_pp_tgpr data/test_eval92 exp/tri4b/decode_bd_pp_tgpr_eval92
)

# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data). Use 30 jobs.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train_si284 data/lang exp/tri4b exp/tri4b_ali_si284 || exit 1;
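# (These tri4b alignments are used by local/run_mmi_tri4b.sh below.)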

# These demonstrate how to build a system usable for online decoding with the
# nnet2 setup. (See local/run_nnet2.sh for other, non-online nnet2 setups.)
local/online/run_nnet2.sh
local/online/run_nnet2_baseline.sh
local/online/run_nnet2_discriminative.sh

local/run_mmi_tri4b.sh

#local/run_nnet2.sh

## Segregated some SGMM builds into a separate file.
#local/run_sgmm.sh

# You probably want to run the sgmm2 recipe as it's generally a bit better:
local/run_sgmm2.sh

# We demonstrate MAP adaptation of GMMs to gender-dependent systems here. This also serves
# as a generic way to demonstrate MAP adaptation to different domains.
# local/run_gender_dep.sh

# You probably want to run the hybrid recipe as it is complementary:
local/run_dnn.sh

# The next two commands show how to train a bottleneck network based on the nnet2 setup,
# and build an SGMM system on top of it.
#local/run_bnf.sh
#local/run_bnf_sgmm.sh

# You probably want to try KL-HMM:
#local/run_kl_hmm.sh

# Getting results [see the RESULTS file]:
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done

# KWS setup. We leave it commented out by default.

# $duration is the length of the search collection, in seconds.
#duration=`feat-to-len scp:data/test_eval92/feats.scp ark,t:- | awk '{x+=$2} END{print x/100;}'`
#local/generate_example_kws.sh data/test_eval92/ data/kws/
#local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/
#
#steps/make_index.sh --cmd "$decode_cmd" --acwt 0.1 \
#  data/kws/ data/lang_test_bd_tgpr/ \
#  exp/tri4b/decode_bd_tgpr_eval92/ \
#  exp/tri4b/decode_bd_tgpr_eval92/kws
#
#steps/search_index.sh --cmd "$decode_cmd" \
#  data/kws \
#  exp/tri4b/decode_bd_tgpr_eval92/kws
#
# If you want to provide the start time for each utterance, you can use the --segments
# option. In WSJ each file is an utterance, so we don't have to set the start time.
#cat exp/tri4b/decode_bd_tgpr_eval92/kws/result.* | \
#  utils/write_kwslist.pl --flen=0.01 --duration=$duration \
#    --normalize=true --map-utter=data/kws/utter_map \
#    - exp/tri4b/decode_bd_tgpr_eval92/kws/kwslist.xml

# Forward-backward decoding example [a way to speed up decoding by decoding
# forward and backward in time]:
# local/run_fwdbwd.sh