author    Gaurav Kumar <gaurav.bison@gmail.com>  Fri, 26 May 2017 20:25:14 +0000 (16:25 -0400)
committer Daniel Povey <dpovey@gmail.com>        Fri, 26 May 2017 20:25:14 +0000 (16:25 -0400)
diff --git a/egs/fisher_callhome_spanish/s5/local/callhome_create_splits.sh b/egs/fisher_callhome_spanish/s5/local/callhome_create_splits.sh
index 4f5d54903365511adb84a614c2ea0dbb2857324a..07814da46a9bced72a5633cd2a59298545d85367 100755 (executable)
utils/fix_data_dir.sh $data_dir/$dirName
utils/validate_data_dir.sh $data_dir/$dirName
- rm $data_dir/$dirName/*.tmp
done
diff --git a/egs/fisher_callhome_spanish/s5/local/create_splits.sh b/egs/fisher_callhome_spanish/s5/local/create_splits.sh
index 6423184fabf0291f2f1bddfb78f73956c9842a5a..8a60dc9d422d18912786ac211ef63d0f69ab5baf 100755 (executable)
utils/fix_data_dir.sh $data_dir/$split
utils/validate_data_dir.sh $data_dir/$split
- rm $data_dir/$split/*.tmp
done
diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh
index 8fe80b4678453e5985b486c8e4b147788e4bd6c2..441e547958ecbe7daa5b71f763880d2c3769df1b 100755 (executable)
exit 1;
fi
-if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ];
+#if [ ! -d links/LDC2010S01/DISC1/data/speech -o ! -d links/LDC2010S01/DISC2/data/speech ];
+if [ ! -d links/LDC2010S01/data/speech ];
then
- echo "Disc 1 and 2 directories missing or not properly organised within the speech data dir"
- echo "Typical format is LDC2010S01/DISC?/data/speech"
+ echo "Speech directories missing or not properly organised within the speech data dir"
+ echo "Typical format is LDC2010S01/data/speech"
exit 1;
fi
#Check the transcripts directories as well to see if they exist
-if [ ! -d links/LDC2010T04/data/transcripts ];
+if [ ! -d links/LDC2010T04/fisher_spa_tr/data/transcripts ];
then
echo "Transcript directories missing or not properly organised"
- echo "Typical format is LDC2010T04/data/transcripts"
+ echo "Typical format is LDC2010T04/fisher_spa_tr/data/transcripts"
exit 1;
fi
-speech_d1=$dir/links/LDC2010S01/DISC1/data/speech
-speech_d2=$dir/links/LDC2010S01/DISC2/data/speech
-transcripts=$dir/links/LDC2010T04/data/transcripts
+#speech_d1=$dir/links/LDC2010S01/DISC1/data/speech
+#speech_d2=$dir/links/LDC2010S01/DISC2/data/speech
+speech=$dir/links/LDC2010S01/data/speech
+transcripts=$dir/links/LDC2010T04/fisher_spa_tr/data/transcripts
-fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l`
-fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l`
+#fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l`
+#fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l`
+fcount_s=`find ${speech} -iname '*.sph' | wc -l`
fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l`
#TODO:it seems like not all speech files have transcripts
#Now check if we got all the files that we needed
-if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ];
+#if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ];
+if [ $fcount_s != 819 -o $fcount_t != 819 ];
then
echo "Incorrect number of files in the data directories"
- echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively"
+ echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively (Total = 819)"
echo "The transcripts should contain 819 files"
exit 1;
fi
#Gather all the speech files together to create a file list
#TODO: Train and test split might be required
(
- find $speech_d1 -iname '*.sph';
- find $speech_d2 -iname '*.sph';
+ #find $speech_d1 -iname '*.sph';
+ #find $speech_d2 -iname '*.sph';
+ find $speech -iname '*.sph';
) > $tmpdir/train_sph.flist
#Get all the transcripts in one place
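The checks above now assume a combined LDC2010S01 layout with a single data/speech directory (819 .sph files, i.e. DISC1's 411 plus DISC2's 408) and an LDC2010T04 tree that unpacks to fisher_spa_tr/data/transcripts (819 .tdf files). A minimal stand-alone sketch for sanity-checking a local copy before running the recipe; the corpus_root path is a placeholder, not part of the recipe:

#!/usr/bin/env bash
# Hypothetical pre-flight check mirroring the tests in fsp_data_prep.sh above.
corpus_root=/path/to/LDC   # placeholder: point this at your local LDC mirror

speech=$corpus_root/LDC2010S01/data/speech
transcripts=$corpus_root/LDC2010T04/fisher_spa_tr/data/transcripts

for d in "$speech" "$transcripts"; do
  [ -d "$d" ] || { echo "Missing directory: $d"; exit 1; }
done

sph_count=$(find "$speech" -iname '*.sph' | wc -l)
tdf_count=$(find "$transcripts" -iname '*.tdf' | wc -l)
echo "sph files: $sph_count  tdf files: $tdf_count"   # both should be 819
[ "$sph_count" -eq 819 ] && [ "$tdf_count" -eq 819 ] || exit 1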
diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_train_lms.sh b/egs/fisher_callhome_spanish/s5/local/fsp_train_lms.sh
index 671cf2a0419b4faca59935affd0162575d358062..24eeac8d0f7719195b1f3a9d17f65cb3567a98e2 100755 (executable)
else
echo Downloading and installing the kaldi_lm tools
if [ ! -f kaldi_lm.tar.gz ]; then
- wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
+ wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;
cd kaldi_lm
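The hunk above simply repoints the kaldi_lm download from the Brno mirror to www.danielpovey.com. If one mirror is unreachable, a hedged variant of the same download step tries the new URL first and falls back to the old one (both URLs are taken from the diff):

if [ ! -f kaldi_lm.tar.gz ]; then
  # Try the current location first, then fall back to the older mirror.
  wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || \
    wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1;
fi
tar -xvzf kaldi_lm.tar.gz || exit 1;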
diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh
index ad650cd390e909622611a9f013deeb3b081bdaaf..e4661ce06faffcecb1fba0dd81f9111989136c4a 100755 (executable)
mfccdir=`pwd`/mfcc
set -e
+stage=1
+
# call the next line with the directory where the Spanish Fisher data is
# (the values below are just an example). This should contain
# subdirectories named as follows:
# DISC1 DISC2
-sfisher_speech=/home/mpost/data/LDC/LDC2010S01
-sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04
-spanish_lexicon=/export/corpora/LDC/LDC96L16
+sfisher_speech=/export/a16/gkumar/corpora/LDC2010S01
+sfisher_transcripts=/export/a16/gkumar/corpora/LDC2010T04
+spanish_lexicon=/export/a16/gkumar/corpora/LDC96L16
split=local/splits/split_fisher
-callhome_speech=/export/corpora/LDC/LDC96S35
-callhome_transcripts=/export/corpora/LDC/LDC96T17
-split=local/splits/split_callhome
-
-local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts
-
-local/callhome_data_prep.sh $callhome_speech $callhome_transcripts
-
-local/fsp_prepare_dict.sh $spanish_lexicon
-
-# Rewrite ----------------------------- This section is no longer needed----
-# At this point, it might make sense to use a bigger lexicon
-# The one I will use is derived from this exercise (spanish fisher) and
-# the LDC spanish lexicon along with the most frequent words derived from the
-# gigaword corpus such that the total number of entries in the lexicon
-# are 64k
-
-# To generate the merged lexicon, run
-# /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py
-# you might have to set the locations of the three lexicons within this
-# file. Note that the LDC rule base phoneme generator works only from its
-# own directory. So the merged lexicon is actually created in
-# /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k
-# This can be easily fixed and will be done. #TODO
-# Also run the clean lexicon script to take care of non stressable vowels
-
-# First make a copy of the old lexicon
-#mv data/local/dict/lexicon.txt data/local/dict/lexicon.txt.bak
-#cp /export/a04/gkumar/corpora/gigaword/bin/clean-merged-lexicon data/local/dict/lexicon.txt
-# ------------ Rewrite -----------------------
-
-# Added c,j, v to the non silences phones manually
-utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
-
-
-# Make sure that you do not use your test and your dev sets to train the LM
-# Some form of cross validation is possible where you decode your dev/set based on an
-# LM that is trained on everything but that that conversation
-# When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl
-# to get the numbers. Depending on your needs, you might have to change the size of
-# the splits within that file. The default paritions are based on the Kaldi + Joshua
-# requirements which means that I have very large dev and test sets
-local/fsp_train_lms.sh $split
-local/fsp_create_test_lang.sh
-
-utils/fix_data_dir.sh data/local/data/train_all
-
-steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
-
-utils/fix_data_dir.sh data/local/data/train_all
-utils/validate_data_dir.sh data/local/data/train_all
-
-cp -r data/local/data/train_all data/train_all
-
-# For the CALLHOME corpus
-utils/fix_data_dir.sh data/local/data/callhome_train_all
-
-steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1;
-
-utils/fix_data_dir.sh data/local/data/callhome_train_all
-utils/validate_data_dir.sh data/local/data/callhome_train_all
-
-cp -r data/local/data/callhome_train_all data/callhome_train_all
-
-# Creating data partitions for the pipeline
-# We need datasets for both the ASR and SMT system
-# We have 257455 utterances left, so the partitions are roughly as follows
-# ASR Train : 100k utterances
-# ASR Tune : 17455 utterances
-# ASR Eval : 20k utterances
-# MT Train : 100k utterances
-# MT Tune : Same as the ASR eval set (Use the lattices from here)
-# MT Eval : 20k utterances
-# The dev and the test sets need to be carefully chosen so that there is no conversation/speaker
-# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below.
-# As noted above, the LM has not been trained on the dev and the test sets.
-#utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test
-#utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test
-#utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test
-#utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev
-#utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test
-#utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train
-#utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test
-#rm -r data/dev_and_test
-#rm -r data/asr_dev_and_test
-#rm -r data/mt_train_and_test
-
-local/create_splits.sh $split
-local/callhome_create_splits.sh $split_callhome
-
-# Now compute CMVN stats for the train, dev and test subsets
-steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
-steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
-steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir
-#steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir
-#steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir
-
-#n=$[`cat data/train_all/segments | wc -l` - 158126]
-#utils/subset_data_dir.sh --last data/train_all $n data/train
-steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
-
-steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir
-steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir
-steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir
-
-# Again from Dan's recipe : Reduced monophone training data
-# Now-- there are 1.6 million utterances, and we want to start the monophone training
-# on relatively short utterances (easier to align), but not only the very shortest
-# ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
-# utterances from those.
-
-utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort
-utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k
-utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup
-utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k
-utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k
+callhome_speech=/export/a16/gkumar/corpora/LDC96S35
+callhome_transcripts=/export/a16/gkumar/corpora/LDC96T17
+split_callhome=local/splits/split_callhome
+
+if [ $stage -lt 1 ]; then
+ local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts
+
+ local/callhome_data_prep.sh $callhome_speech $callhome_transcripts
+
+  # The lexicon is created from the LDC Spanish lexicon and the words in the
+  # Fisher Spanish corpus. Additional (most frequent) words are added from the
+  # ES gigaword corpus to bring the total to 64k words. The ES frequency-sorted
+  # wordlist is downloaded if it is not available.
+ local/fsp_prepare_dict.sh $spanish_lexicon
+
+  # Added c, j, v to the non-silence phones manually
+ utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
+
+ # Make sure that you do not use your test and your dev sets to train the LM
+  # Some form of cross validation is possible where you decode your dev set based on an
+  # LM that is trained on everything but that conversation.
+  # When in doubt about what your data partitions should be, use local/fsp_ideal_data_partitions.pl
+  # to get the numbers. Depending on your needs, you might have to change the size of
+  # the splits within that file. The default partitions are based on the Kaldi + Joshua
+ # requirements which means that I have very large dev and test sets
+ local/fsp_train_lms.sh $split
+ local/fsp_create_test_lang.sh
+
+ utils/fix_data_dir.sh data/local/data/train_all
+
+ steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
+
+ utils/fix_data_dir.sh data/local/data/train_all
+ utils/validate_data_dir.sh data/local/data/train_all
+
+ cp -r data/local/data/train_all data/train_all
+
+ # For the CALLHOME corpus
+ utils/fix_data_dir.sh data/local/data/callhome_train_all
+
+ steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1;
+
+ utils/fix_data_dir.sh data/local/data/callhome_train_all
+ utils/validate_data_dir.sh data/local/data/callhome_train_all
+
+ cp -r data/local/data/callhome_train_all data/callhome_train_all
+
+ # Creating data partitions for the pipeline
+ # We need datasets for both the ASR and SMT system
+ # We have 257455 utterances left, so the partitions are roughly as follows
+ # ASR Train : 100k utterances
+ # ASR Tune : 17455 utterances
+ # ASR Eval : 20k utterances
+ # MT Train : 100k utterances
+ # MT Tune : Same as the ASR eval set (Use the lattices from here)
+ # MT Eval : 20k utterances
+ # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker
+ # overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below.
+ # As noted above, the LM has not been trained on the dev and the test sets.
+ #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test
+ #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test
+ #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test
+ #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev
+ #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test
+ #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train
+ #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test
+ #rm -r data/dev_and_test
+ #rm -r data/asr_dev_and_test
+ #rm -r data/mt_train_and_test
+
+ local/create_splits.sh $split
+ local/callhome_create_splits.sh $split_callhome
+fi
+
+if [ $stage -lt 2 ]; then
+ # Now compute CMVN stats for the train, dev and test subsets
+ steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
+ steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
+ steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir
+ #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir
+ #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir
+
+ #n=$[`cat data/train_all/segments | wc -l` - 158126]
+ #utils/subset_data_dir.sh --last data/train_all $n data/train
+ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
+
+ steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir
+ steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir
+ steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir
+
+ # Again from Dan's recipe : Reduced monophone training data
+ # Now-- there are 1.6 million utterances, and we want to start the monophone training
+ # on relatively short utterances (easier to align), but not only the very shortest
+ # ones (mostly uh-huh). So take the 100k shortest ones, and then take 10k random
+ # utterances from those.
+
+ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort
+ utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k
+ local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup
+ utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k
+ utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k
+fi
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train_10k_nodup data/lang exp/mono0a
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
-steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
+steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \
--config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\
exp/tri5a/graph data/dev exp/tri5a/decode_dev
utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph
dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \
- --parallel-opts "--num-threads 16" --cmd "queue.pl --mem 2G")
+ --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 --mem 2G")
dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \
- --parallel-opts "--gpu 1" --cmd "queue.pl --mem 2G")
+ --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 --mem 2G")
steps/nnet2/train_pnorm_ensemble.sh \
--mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\
data/train data/lang exp/tri5a_ali exp/tri6a_dnn
(
- steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
+ steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev
) &
wait
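run.sh now gates its first two sections on a stage variable (hard-coded to stage=1, so the data-preparation block under "if [ $stage -lt 1 ]" only runs if you lower it at the top of the script). A minimal sketch, assuming the standard utils/parse_options.sh helper that Kaldi egs directories link in, of exposing the same variable as a command-line flag instead:

stage=0   # default chosen here so everything runs; the committed script sets stage=1

# utils/parse_options.sh rewrites a command-line "--stage N" into stage=N;
# it must be sourced after the default value is set.
. utils/parse_options.sh || exit 1;

if [ $stage -lt 1 ]; then
  echo "stage 0: data preparation, lexicon, LM, MFCCs, splits"
fi

if [ $stage -lt 2 ]; then
  echo "stage 1: CMVN stats and training subsets"
fi

# ./run.sh              runs both blocks
# ./run.sh --stage 1    skips data preparation
# ./run.sh --stage 2    skips both blocks and falls through to model training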