egs/wsj/s5/run.sh

   1 #!/bin/bash
   2
   3 stage=0
   4 train=true   # set to false to disable the training-related scripts
   5              # note: you probably only want to set --train false if you
   6              # are using at least --stage 1.
   7 decode=true  # set to false to disable the decoding-related scripts.
   8
   9 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
  10            ## This relates to the queue.
  11 . utils/parse_options.sh  # e.g. this parses the --stage option if supplied.
  12
  13
  14 # This is a shell script, but it's recommended that you run the commands one by
  15 # one by copying and pasting into the shell.
  16
  17 #wsj0=/ais/gobi2/speech/WSJ/csr_?_senn_d?
  18 #wsj1=/ais/gobi2/speech/WSJ/csr_senn_d?
  19
  20 #wsj0=/mnt/matylda2/data/WSJ0
  21 #wsj1=/mnt/matylda2/data/WSJ1
  22
  23 #wsj0=/data/corpora0/LDC93S6B
  24 #wsj1=/data/corpora0/LDC94S13B
  25
  26 wsj0=/export/corpora5/LDC/LDC93S6B
  27 wsj1=/export/corpora5/LDC/LDC94S13B
  28
  29
  30 if [ $stage -le 0 ]; then
  31   # data preparation.
  32   local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.?  || exit 1;
  33
  34   # Sometimes, we have seen WSJ distributions that do not have subdirectories
  35   # like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the
  36   # wsj0 or wsj1 directories. In such cases, try the following:
  37   #
  38   # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj
  39   # local/cstr_wsj_data_prep.sh $corpus
  40   # rm data/local/dict/lexiconp.txt
  41   # $corpus must contain a 'wsj0' and a 'wsj1' subdirectory for this to work.
  42   #
  43   # "nosp" refers to the dictionary before silence probabilities and pronunciation
  44   # probabilities are added.
  45   local/wsj_prepare_dict.sh --dict-suffix "_nosp" || exit 1;
  46
  47   utils/prepare_lang.sh data/local/dict_nosp \
  48                         "<SPOKEN_NOISE>" data/local/lang_tmp_nosp data/lang_nosp || exit 1;
  49
  50   local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1;
  51
  52   # We suggest to run the next three commands in the background,
  53   # as they are not a precondition for the system building and
  54   # most of the tests: these commands build a dictionary
  55   # containing many of the OOVs in the WSJ LM training data,
  56   # and an LM trained directly on that data (i.e. not just
  57   # copying the arpa files from the disks from LDC).
  58   # Caution: the commands below will only work if $decode_cmd
  59   # is setup to use qsub.  Else, just remove the --cmd option.
  60   # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style,
  61   # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead.
  62   (
  63     local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1  && \
  64       utils/prepare_lang.sh data/local/dict_nosp_larger \
  65                             "<SPOKEN_NOISE>" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \
  66       local/wsj_train_lms.sh --dict-suffix "_nosp" &&
  67       local/wsj_format_local_lms.sh --lang-suffix "_nosp" # &&
  68   ) &
  69
  70   # Now make MFCC features.
  71   # mfccdir should be some place with a largish disk where you
  72   # want to store MFCC features.
  73
  74   for x in test_eval92 test_eval93 test_dev93 train_si284; do
  75     steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$x || exit 1;
  76     steps/compute_cmvn_stats.sh data/$x || exit 1;
  77   done
  78
  79   utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1
  80
  81   # Now make subset with the shortest 2k utterances from si-84.
  82   utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1;
  83
  84   # Now make subset with half of the data from si-84.
  85   utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1;
  86 fi
  87
  88
  89 if [ $stage -le 1 ]; then
  90   # monophone
  91
  92
  93   # Note: the --boost-silence option should probably be omitted by default
  94   # for normal setups.  It doesn't always help. [it's to discourage non-silence
  95   # models from modeling silence.]
  96   if $train; then
  97     steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
  98       data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1;
  99   fi
 100
 101   if $decode; then
 102     utils/mkgraph.sh data/lang_nosp_test_tgpr exp/mono0a exp/mono0a/graph_nosp_tgpr && \
 103       steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \
 104         data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \
 105       steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \
 106         data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92
 107   fi
 108 fi
 109
 110 if [ $stage -le 2 ]; then
 111   # tri1
 112   if $train; then
 113     steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
 114       data/train_si84_half data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1;
 115
 116     steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
 117       data/train_si84_half data/lang_nosp exp/mono0a_ali exp/tri1 || exit 1;
 118   fi
 119
 120   if $decode; then
 121     utils/mkgraph.sh data/lang_nosp_test_tgpr \
 122       exp/tri1 exp/tri1/graph_nosp_tgpr || exit 1;
 123
 124     for data in dev93 eval92; do
 125       nspk=$(wc -l <data/test_${data}/spk2utt)
 126       steps/decode.sh --nj $nspk --cmd "$decode_cmd" exp/tri1/graph_nosp_tgpr \
 127         data/test_${data} exp/tri1/decode_nosp_tgpr_${data} || exit 1;
 128
 129       # test various modes of LM rescoring (4 is the default one).
 130       # This is just confirming they're equivalent.
 131       for mode in 1 2 3 4; do
 132         steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \
 133           data/lang_nosp_test_{tgpr,tg} data/test_${data} \
 134           exp/tri1/decode_nosp_tgpr_${data} \
 135           exp/tri1/decode_nosp_tgpr_${data}_tg$mode  || exit 1;
 136       done
 137       # later on we'll demonstrate const-arpa LM rescoring, which is now
 138       # the recommended method.
 139     done
 140
 141     ## the following command demonstrates how to get lattices that are
 142     ## "word-aligned" (arcs coincide with words, with boundaries in the right
 143     ## place).
 144     #sil_label=`grep '!SIL' data/lang_nosp_test_tgpr/words.txt | awk '{print $2}'`
 145     #steps/word_align_lattices.sh --cmd "$train_cmd" --silence-label $sil_label \
 146     #  data/lang_nosp_test_tgpr exp/tri1/decode_nosp_tgpr_dev93 \
 147     #  exp/tri1/decode_nosp_tgpr_dev93_aligned || exit 1;
 148   fi
 149 fi
 150
 151
 152 if [ $stage -le 3 ]; then
 153   # tri2b.  there is no special meaning in the "b"-- it's historical.
 154   if $train; then
 155     steps/align_si.sh --nj 10 --cmd "$train_cmd" \
 156       data/train_si84 data/lang_nosp exp/tri1 exp/tri1_ali_si84 || exit 1;
 157
 158     steps/train_lda_mllt.sh --cmd "$train_cmd" \
 159       --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
 160       data/train_si84 data/lang_nosp exp/tri1_ali_si84 exp/tri2b || exit 1;
 161   fi
 162
 163   if $decode; then
 164     utils/mkgraph.sh data/lang_nosp_test_tgpr \
 165       exp/tri2b exp/tri2b/graph_nosp_tgpr || exit 1;
 166     for data in dev93 eval92; do
 167       nspk=$(wc -l <data/test_${data}/spk2utt)
 168       steps/decode.sh --nj ${nspk} --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgpr \
 169         data/test_${data} exp/tri2b/decode_nosp_tgpr_${data} || exit 1;
 170
 171        # compare lattice rescoring with biglm decoding, going from tgpr to tg.
 172       steps/decode_biglm.sh --nj ${nspk} --cmd "$decode_cmd" \
 173         exp/tri2b/graph_nosp_tgpr data/lang_nosp_test_{tgpr,tg}/G.fst \
 174         data/test_${data} exp/tri2b/decode_nosp_tgpr_${data}_tg_biglm
 175
 176        # baseline via LM rescoring of lattices.
 177       steps/lmrescore.sh --cmd "$decode_cmd" \
 178         data/lang_nosp_test_tgpr/ data/lang_nosp_test_tg/ \
 179         data/test_${data} exp/tri2b/decode_nosp_tgpr_${data} \
 180         exp/tri2b/decode_nosp_tgpr_${data}_tg || exit 1;
 181
 182       # Demonstrating Minimum Bayes Risk decoding (like Confusion Network decoding):
 183       mkdir exp/tri2b/decode_nosp_tgpr_${data}_tg_mbr
 184       cp exp/tri2b/decode_nosp_tgpr_${data}_tg/lat.*.gz \
 185          exp/tri2b/decode_nosp_tgpr_${data}_tg_mbr;
 186       local/score_mbr.sh --cmd "$decode_cmd"  \
 187          data/test_${data}/ data/lang_nosp_test_tgpr/ \
 188          exp/tri2b/decode_nosp_tgpr_${data}_tg_mbr
 189     done
 190   fi
 191
 192   # At this point, you could run the example scripts that show how VTLN works.
 193   # We haven't included this in the default recipes.
 194   # local/run_vtln.sh --lang-suffix "_nosp"
 195   # local/run_vtln2.sh --lang-suffix "_nosp"
 196 fi
 197
 198
# local/run_deltas.sh trains a delta+delta-delta system.  It's not really recommended or
# necessary, but it does contain a demonstration of the decode_fromlats.sh
# script which isn't used elsewhere.
# local/run_deltas.sh
 203
 204 if [ $stage -le 4 ]; then
 205   # From 2b system, train 3b which is LDA + MLLT + SAT.
 206
 207   # Align tri2b system with all the si284 data.
 208   if $train; then
 209     steps/align_si.sh  --nj 10 --cmd "$train_cmd" \
 210       data/train_si284 data/lang_nosp exp/tri2b exp/tri2b_ali_si284  || exit 1;
 211
 212     steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
 213       data/train_si284 data/lang_nosp exp/tri2b_ali_si284 exp/tri3b || exit 1;
 214   fi
 215
 216   if $decode; then
 217     utils/mkgraph.sh data/lang_nosp_test_tgpr \
 218       exp/tri3b exp/tri3b/graph_nosp_tgpr || exit 1;
 219
 220     # the larger dictionary ("big-dict"/bd) + locally produced LM.
 221     utils/mkgraph.sh data/lang_nosp_test_bd_tgpr \
 222       exp/tri3b exp/tri3b/graph_nosp_bd_tgpr || exit 1;
 223
 224     # At this point you could run the command below; this gets
 225     # results that demonstrate the basis-fMLLR adaptation (adaptation
 226     # on small amounts of adaptation data).
 227     # local/run_basis_fmllr.sh --lang-suffix "_nosp"
 228
 229     for data in dev93 eval92; do
 230       nspk=$(wc -l <data/test_${data}/spk2utt)
 231       steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
 232         exp/tri3b/graph_nosp_tgpr data/test_${data} \
 233         exp/tri3b/decode_nosp_tgpr_${data} || exit 1;
 234       steps/lmrescore.sh --cmd "$decode_cmd" \
 235         data/lang_nosp_test_tgpr data/lang_nosp_test_tg \
 236         data/test_${data} exp/tri3b/decode_nosp_{tgpr,tg}_${data} || exit 1
 237
 238       # decode with big dictionary.
 239       steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 8 \
 240         exp/tri3b/graph_nosp_bd_tgpr data/test_${data} \
 241         exp/tri3b/decode_nosp_bd_tgpr_${data} || exit 1;
 242
 243       # Example of rescoring with ConstArpaLm.
 244       steps/lmrescore_const_arpa.sh \
 245         --cmd "$decode_cmd" data/lang_nosp_test_bd_{tgpr,fgconst} \
 246         data/test_${data} exp/tri3b/decode_nosp_bd_tgpr_${data}{,_fg} || exit 1;
 247     done
 248   fi
 249 fi
 250
 251 if [ $stage -le 5 ]; then
 252   # Estimate pronunciation and silence probabilities.
 253
 254   # Silprob for normal lexicon.
 255   steps/get_prons.sh --cmd "$train_cmd" \
 256     data/train_si284 data/lang_nosp exp/tri3b || exit 1;
 257   utils/dict_dir_add_pronprobs.sh --max-normalize true \
 258     data/local/dict_nosp \
 259     exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
 260     exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict || exit 1
 261
 262   utils/prepare_lang.sh data/local/dict \
 263     "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;
 264
 265   for lm_suffix in bg bg_5k tg tg_5k tgpr tgpr_5k; do
 266     mkdir -p data/lang_test_${lm_suffix}
 267     cp -r data/lang/* data/lang_test_${lm_suffix}/ || exit 1;
 268     rm -rf data/lang_test_${lm_suffix}/tmp
 269     cp data/lang_nosp_test_${lm_suffix}/G.* data/lang_test_${lm_suffix}/
 270   done
 271
 272   # Silprob for larger ("bd") lexicon.
 273   utils/dict_dir_add_pronprobs.sh --max-normalize true \
 274     data/local/dict_nosp_larger \
 275     exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \
 276     exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_larger || exit 1
 277
 278   utils/prepare_lang.sh data/local/dict_larger \
 279     "<SPOKEN_NOISE>" data/local/lang_tmp_larger data/lang_bd || exit 1;
 280
 281   for lm_suffix in tgpr tgconst tg fgpr fgconst fg; do
 282     mkdir -p data/lang_test_bd_${lm_suffix}
 283     cp -r data/lang_bd/* data/lang_test_bd_${lm_suffix}/ || exit 1;
 284     rm -rf data/lang_test_bd_${lm_suffix}/tmp
 285     cp data/lang_nosp_test_bd_${lm_suffix}/G.* data/lang_test_bd_${lm_suffix}/
 286   done
 287 fi
 288
 289
 290 if [ $stage -le 6 ]; then
 291   # From 3b system, now using data/lang as the lang directory (we have now added
 292   # pronunciation and silence probabilities), train another SAT system (tri4b).
 293
 294   if $train; then
 295     steps/train_sat.sh  --cmd "$train_cmd" 4200 40000 \
 296       data/train_si284 data/lang exp/tri3b exp/tri4b || exit 1;
 297   fi
 298
 299   if $decode; then
 300     utils/mkgraph.sh data/lang_test_tgpr \
 301       exp/tri4b exp/tri4b/graph_tgpr || exit 1;
 302     utils/mkgraph.sh data/lang_test_bd_tgpr \
 303       exp/tri4b exp/tri4b/graph_bd_tgpr || exit 1;
 304
 305     for data in dev93 eval92; do
 306       nspk=$(wc -l <data/test_${data}/spk2utt)
 307       steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
 308         exp/tri4b/graph_tgpr data/test_${data} \
 309         exp/tri4b/decode_tgpr_${data} || exit 1;
 310       steps/lmrescore.sh --cmd "$decode_cmd" \
 311         data/lang_test_tgpr data/lang_test_tg \
 312         data/test_${data} exp/tri4b/decode_{tgpr,tg}_${data} || exit 1
 313
 314       steps/decode_fmllr.sh --nj ${nspk} --cmd "$decode_cmd" \
 315         exp/tri4b/graph_bd_tgpr data/test_${data} \
 316         exp/tri4b/decode_bd_tgpr_${data} || exit 1;
 317       steps/lmrescore_const_arpa.sh \
 318         --cmd "$decode_cmd" data/lang_test_bd_{tgpr,fgconst} \
 319         data/test_${data} exp/tri4b/decode_bd_tgpr_${data}{,_fg} || exit 1;
 320     done
 321   fi
 322 fi
 323
 324
 325 exit 0;
 326
 327 ### Caution: the parts of the script below this statement are not run by default.
 328 ###
 329
 330
 331 # Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
 332 # all the data).  Use 30 jobs.
 333 steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
 334   data/train_si284 data/lang exp/tri4b exp/tri4b_ali_si284 || exit 1;
 335 local/run_mmi_tri4b.sh
 336
 337 # These demonstrate how to build a sytem usable for online-decoding with the nnet2 setup.
 338 # (see local/run_nnet2.sh for other, non-online nnet2 setups).
 339 local/online/run_nnet2.sh
 340 local/online/run_nnet2_baseline.sh
 341 local/online/run_nnet2_discriminative.sh
 342
 343 # Demonstration of RNNLM rescoring on TDNN models. We comment this out by
 344 # default.
 345 # local/run_rnnlms.sh
 346
 347
 348 #local/run_nnet2.sh
 349
 350 # You probably want to run the sgmm2 recipe as it's generally a bit better:
 351 local/run_sgmm2.sh
 352
 353 # We demonstrate MAP adaptation of GMMs to gender-dependent systems here.  This also serves
 354 # as a generic way to demonstrate MAP adaptation to different domains.
 355 # local/run_gender_dep.sh
 356
 357 # You probably want to run the hybrid recipe as it is complementary:
 358 local/nnet/run_dnn.sh
 359
 360 # The following demonstrate how to re-segment long audios.
 361 # local/run_segmentation_long_utts.sh
 362
 363 # The next two commands show how to train a bottleneck network based on the nnet2 setup,
 364 # and build an SGMM system on top of it.
 365 #local/run_bnf.sh
 366 #local/run_bnf_sgmm.sh
 367
 368 # Getting results [see RESULTS file]
 369 # for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
 370
 371
 372 # KWS setup. We leave it commented out by default
 373
 374 # $duration is the length of the search collection, in seconds
 375 #duration=`feat-to-len scp:data/test_eval92/feats.scp  ark,t:- | awk '{x+=$2} END{print x/100;}'`
 376 #local/generate_example_kws.sh data/test_eval92/ data/kws/
 377 #local/kws_data_prep.sh data/lang_test_bd_tgpr/ data/test_eval92/ data/kws/
 378 #
 379 #steps/make_index.sh --cmd "$decode_cmd" --acwt 0.1 \
 380 #  data/kws/ data/lang_test_bd_tgpr/ \
 381 #  exp/tri4b/decode_bd_tgpr_eval92/ \
 382 #  exp/tri4b/decode_bd_tgpr_eval92/kws
 383 #
 384 #steps/search_index.sh --cmd "$decode_cmd" \
 385 #  data/kws \
 386 #  exp/tri4b/decode_bd_tgpr_eval92/kws
 387 #
 388 # If you want to provide the start time for each utterance, you can use the --segments
 389 # option. In WSJ each file is an utterance, so we don't have to set the start time.
 390 #cat exp/tri4b/decode_bd_tgpr_eval92/kws/result.* | \
 391 #  utils/write_kwslist.pl --flen=0.01 --duration=$duration \
 392 #  --normalize=true --map-utter=data/kws/utter_map \
 393 #  - exp/tri4b/decode_bd_tgpr_eval92/kws/kwslist.xml
 394
 395 # # A couple of nnet3 recipes:
 396 # local/nnet3/run_tdnn_baseline.sh  # designed for exact comparison with nnet2 recipe
 397 # local/nnet3/run_tdnn.sh  # better absolute results
 398 # local/nnet3/run_lstm.sh  # lstm recipe
 399 # bidirectional lstm recipe
 400 # local/nnet3/run_lstm.sh --affix bidirectional \
 401 #                         --lstm-delay " [-1,1] [-2,2] [-3,3] " \
 402 #                         --label-delay 0 \
 403 #                         --cell-dim 640 \
 404 #                         --recurrent-projection-dim 128 \
 405 #                         --non-recurrent-projection-dim 128 \
 406 #                         --chunk-left-context 40 \
 407 #                         --chunk-right-context 40