#!/bin/bash

# Copyright      2017  Johns Hopkins University (Author: Daniel Povey)
#                2017  Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2017  David Snyder
# Apache 2.0
#
# This script dumps training examples (egs) for multiclass xvector training.
# These egs consist of a data chunk and a zero-based speaker label.
# Each archive of egs has, in general, a different input chunk-size.
# We don't mix together different lengths in the same archive, because it
# would require us to repeatedly run the compilation process within the same
# training job.
#
# This script, which will generally be called from other neural net training
# scripts, extracts the training examples used to train the neural net (and
# also the validation examples used for diagnostics), and puts them in
# separate archives.

# Begin configuration section.
cmd=run.pl
# Each archive has data chunks of length randomly chosen between
# $min_frames_per_chunk and $max_frames_per_chunk.
min_frames_per_chunk=50
max_frames_per_chunk=300
frames_per_iter=10000000 # target number of frames per archive.

frames_per_iter_diagnostic=100000 # have this many frames per archive for
                                  # the archives used for diagnostics.

num_diagnostic_archives=3 # we want to test the training likelihoods on a
                          # range of utterance lengths, and this number
                          # controls how many archives we evaluate on.

compress=true # set this to false to disable compression (e.g. if you want
              # to see whether results are affected).

num_heldout_utts=100 # number of utterances held out for the validation set
                     # and for the training diagnostic subset

num_repeats=1 # number of times each speaker repeats per archive

stage=0
nj=6 # This should be set to the maximum number of jobs you are comfortable
     # running in parallel; you can increase it if your disk speed is greater
     # and you have more machines.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <data> <egs-dir>"
  echo " e.g.: $0 data/train exp/xvector_a/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed). default=6"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --min-frames-per-chunk <#frames;50>              # The minimum number of frames per chunk that we dump"
  echo "  --max-frames-per-chunk <#frames;300>             # The maximum number of frames per chunk that we dump"
  echo "  --num-repeats <#repeats;1>                       # The (approximate) number of times the training"
  echo "                                                   # data is repeated in the egs"
  echo "  --frames-per-iter <#frames;10000000>             # Target number of frames per archive"
  echo "  --num-diagnostic-archives <#archives;3>          # Controls how many different versions of the train"
  echo "                                                   # and validation archives we create (e.g."
  echo "                                                   # train_subset.{1,2,3}.egs and valid.{1,2,3}.egs by"
  echo "                                                   # default); they contain different utterance lengths."
  echo "  --frames-per-iter-diagnostic <#frames;100000>    # Target number of frames for the diagnostic archives"
  echo "                                                   # {train_subset,valid}.*.egs"
  echo "  --stage <stage|0>                                # Used to resume a partially-completed run of this"
  echo "                                                   # script from somewhere in the middle."
  exit 1;
fi

data=$1
dir=$2

for f in $data/utt2num_frames $data/feats.scp ; do
  [ ! -f $f ] && echo "$0: expected file $f" && exit 1;
done
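
# For reference: $data/utt2num_frames has one line per utterance, e.g.
# "utt-0001 4213" (utterance-id and number of frames), and $data/feats.scp maps
# each utterance-id to its feature matrix, e.g. "utt-0001 /path/to/raw_mfcc.1.ark:41"
# (the utterance-id, path and offset here are purely illustrative).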

feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1

mkdir -p $dir/info $dir/temp
temp=$dir/temp

echo $feat_dim > $dir/info/feat_dim
echo '0' > $dir/info/left_context
# The examples have at least min_frames_per_chunk right context.
echo $min_frames_per_chunk > $dir/info/right_context
echo '1' > $dir/info/frames_per_eg
cp $data/utt2num_frames $dir/temp/utt2num_frames
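
# At this point $dir/info contains feat_dim, left_context, right_context and
# frames_per_eg; num_frames, num_archives and num_diagnostic_archives are
# written there further below, once they have been computed.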

if [ $stage -le 0 ]; then
  echo "$0: Preparing train and validation lists"
  # Pick a list of heldout utterances for validation egs
  awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/valid_uttlist || exit 1;
  # The remaining utterances are used for training egs
  utils/filter_scp.pl --exclude $temp/valid_uttlist $temp/utt2num_frames > $temp/utt2num_frames.train
  utils/filter_scp.pl $temp/valid_uttlist $temp/utt2num_frames > $temp/utt2num_frames.valid
  # Pick a subset of the training list for diagnostics
  awk '{print $1}' $temp/utt2num_frames.train | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/train_subset_uttlist || exit 1;
  utils/filter_scp.pl $temp/train_subset_uttlist <$temp/utt2num_frames.train > $temp/utt2num_frames.train_subset
  # Map each speaker to a zero-based integer label, then map each utterance
  # to its speaker's integer label.
  awk -v id=0 '{print $1, id++}' $data/spk2utt > $temp/spk2int
  utils/sym2int.pl -f 2 $temp/spk2int $data/utt2spk > $temp/utt2int
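  # For illustration (speaker and utterance names made up): if spk2utt contains
  # "spk-A utt-1 utt-2" and "spk-B utt-3", then spk2int will contain "spk-A 0"
  # and "spk-B 1", and utt2int will contain "utt-1 0", "utt-2 0" and "utt-3 1".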
  utils/filter_scp.pl $temp/utt2num_frames.train $temp/utt2int > $temp/utt2int.train
  utils/filter_scp.pl $temp/utt2num_frames.valid $temp/utt2int > $temp/utt2int.valid
  utils/filter_scp.pl $temp/utt2num_frames.train_subset $temp/utt2int > $temp/utt2int.train_subset
fi

num_pdfs=$(awk '{print $2}' $temp/utt2int | sort | uniq -c | wc -l)
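# num_pdfs is the number of distinct integer speaker labels in utt2int; e.g. if
# the labels run from 0 to 1210 (hypothetical), num_pdfs will be 1211.  It is
# passed to nnet3-xvector-get-egs as --num-pdfs below.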
# The script assumes you've prepared the features ahead of time.
feats="scp,s,cs:utils/filter_scp.pl $temp/ranges.JOB $data/feats.scp |"
train_subset_feats="scp,s,cs:utils/filter_scp.pl $temp/train_subset_ranges.1 $data/feats.scp |"
valid_feats="scp,s,cs:utils/filter_scp.pl $temp/valid_ranges.1 $data/feats.scp |"
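
# Note on the rspecifiers above: "scp,s,cs:" tells Kaldi that the script file is
# sorted and will be accessed in sorted order, and the trailing "|" means the
# filter_scp.pl command is run as a pipe whose output is read as the scp.  The
# literal string JOB in $feats is substituted with the actual job number via
# sed in stage 3 below.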

# First, for the training data, work out how many archives we will have.
num_train_frames=$(awk '{n += $2} END{print n}' <$temp/utt2num_frames.train)
num_train_subset_frames=$(awk '{n += $2} END{print n}' <$temp/utt2num_frames.train_subset)

echo $num_train_frames >$dir/info/num_frames
num_train_archives=$(( (num_train_frames * num_repeats) / frames_per_iter + 1 ))
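# For example (numbers purely illustrative): with 360 million training frames,
# num_repeats=1 and frames_per_iter=10000000, this gives 360000000/10000000 + 1
# = 37 archives.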
echo "$0: Producing $num_train_archives archives for training"
echo $num_train_archives > $dir/info/num_archives
echo $num_diagnostic_archives > $dir/info/num_diagnostic_archives

if [ $nj -gt $num_train_archives ]; then
  echo "$0: Reducing num-jobs $nj to number of training archives $num_train_archives"
  nj=$num_train_archives
fi

if [ $stage -le 1 ]; then
  if [ -e $dir/storage ]; then
    # Make soft links to storage directories, if distributing this way.  See
    # utils/create_split_dir.pl.
    echo "$0: creating data links"
    utils/create_data_link.pl $(for x in $(seq $num_train_archives); do echo $dir/egs.$x.ark; done)
    utils/create_data_link.pl $(for x in $(seq $num_train_archives); do echo $dir/egs_temp.$x.ark; done)
  fi
fi

if [ $stage -le 2 ]; then
  echo "$0: Allocating training examples"
  $cmd $dir/log/allocate_examples_train.log \
    sid/nnet3/xvector/allocate_egs.py \
      --num-repeats=$num_repeats \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --frames-per-iter=$frames_per_iter \
      --num-archives=$num_train_archives --num-jobs=$nj \
      --utt2len-filename=$dir/temp/utt2num_frames.train \
      --utt2int-filename=$dir/temp/utt2int.train --egs-dir=$dir || exit 1

  echo "$0: Allocating training subset examples"
  $cmd $dir/log/allocate_examples_train_subset.log \
    sid/nnet3/xvector/allocate_egs.py \
      --prefix train_subset \
      --num-repeats=1 \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --randomize-chunk-length false \
      --frames-per-iter=$frames_per_iter_diagnostic \
      --num-archives=$num_diagnostic_archives --num-jobs=1 \
      --utt2len-filename=$dir/temp/utt2num_frames.train_subset \
      --utt2int-filename=$dir/temp/utt2int.train_subset --egs-dir=$dir || exit 1

  echo "$0: Allocating validation examples"
  $cmd $dir/log/allocate_examples_valid.log \
    sid/nnet3/xvector/allocate_egs.py \
      --prefix valid \
      --num-repeats=1 \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --randomize-chunk-length false \
      --frames-per-iter=$frames_per_iter_diagnostic \
      --num-archives=$num_diagnostic_archives --num-jobs=1 \
      --utt2len-filename=$dir/temp/utt2num_frames.valid \
      --utt2int-filename=$dir/temp/utt2int.valid --egs-dir=$dir || exit 1
fi

# At this stage we'll have created the ranges files that define how many egs
# there are and where they come from.  If this is your first time running this
# script, you might decide to put an exit 1 command here, and inspect the
# contents of $dir/temp/ranges.* before proceeding to the next stage.
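# Each line of ranges.* describes one chunk to extract: it names the source
# utterance, which archive the chunk is written to, the chunk's start frame
# and length, and the integer speaker label (see sid/nnet3/xvector/allocate_egs.py
# for the exact field order).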
if [ $stage -le 3 ]; then
  echo "$0: Generating training examples on disk"
  rm $dir/.error 2>/dev/null
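  # Each outputs.$g file (written by allocate_egs.py) lists the temporary
  # archives that job $g should write; the awk command below turns entries like
  # "egs_temp.1.ark egs_temp.2.ark" (illustrative names) into the wspecifiers
  # "ark:egs_temp.1.ark ark:egs_temp.2.ark" expected by nnet3-xvector-get-egs.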
  for g in $(seq $nj); do
    outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/outputs.$g)
    $cmd $dir/log/train_create_examples.$g.log \
      nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/ranges.$g \
      "$(echo $feats | sed s/JOB/$g/g)" $outputs || touch $dir/.error &
  done
  train_subset_outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/train_subset_outputs.1)
  echo "$0: Generating training subset examples on disk"
  $cmd $dir/log/train_subset_create_examples.1.log \
    nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/train_subset_ranges.1 \
    "$train_subset_feats" $train_subset_outputs || touch $dir/.error &
  wait
  valid_outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/valid_outputs.1)
  echo "$0: Generating validation examples on disk"
  $cmd $dir/log/valid_create_examples.1.log \
    nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/valid_ranges.1 \
    "$valid_feats" $valid_outputs || touch $dir/.error &
  wait
  if [ -f $dir/.error ]; then
    echo "$0: Problem detected while dumping examples"
    exit 1
  fi
fi

if [ $stage -le 4 ]; then
  echo "$0: Shuffling order of archives on disk"
  $cmd --max-jobs-run $nj JOB=1:$num_train_archives $dir/log/shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/egs_temp.JOB.ark \
    ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp || exit 1;
  $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/train_subset_shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/train_subset_egs_temp.JOB.ark \
    ark,scp:$dir/train_diagnostic_egs.JOB.ark,$dir/train_diagnostic_egs.JOB.scp || exit 1;
  $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/valid_shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/valid_egs_temp.JOB.ark \
    ark,scp:$dir/valid_egs.JOB.ark,$dir/valid_egs.JOB.scp || exit 1;
fi
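
# After stage 4 the shuffled archives are egs.*.{ark,scp},
# train_diagnostic_egs.*.{ark,scp} and valid_egs.*.{ark,scp}; the unshuffled
# *_temp.*.ark archives are no longer needed and are cleaned up in stage 5.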

if [ $stage -le 5 ]; then
  for file in $(for x in $(seq $num_diagnostic_archives); do echo $dir/train_subset_egs_temp.$x.ark; done) \
              $(for x in $(seq $num_diagnostic_archives); do echo $dir/valid_egs_temp.$x.ark; done) \
              $(for x in $(seq $num_train_archives); do echo $dir/egs_temp.$x.ark; done); do
    [ -L $file ] && rm $(readlink -f $file)
    rm $file
  done
  rm -rf $dir/valid_diagnostic.scp $dir/train_diagnostic.scp
  for x in $(seq $num_diagnostic_archives); do
    cat $dir/train_diagnostic_egs.$x.scp >> $dir/train_diagnostic.scp
    cat $dir/valid_egs.$x.scp >> $dir/valid_diagnostic.scp
  done
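  # combine.scp is expected by the downstream nnet3 training scripts (used for
  # the final model-combination diagnostics); here we simply reuse the
  # train_diagnostic examples for it.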
  ln -sf train_diagnostic.scp $dir/combine.scp
fi

echo "$0: Finished preparing training examples"