#!/bin/bash

# Copyright      2017  Johns Hopkins University (Author: Daniel Povey)
#                2017  Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2017  David Snyder
# Apache 2.0
#
# This script dumps training examples (egs) for multiclass xvector training.
# These egs consist of a data chunk and a zero-based speaker label.
# Each archive of egs has, in general, a different input chunk-size.
# We don't mix together different lengths in the same archive, because it
# would require us to repeatedly run the compilation process within the same
# training job.
#
# This script, which will generally be called from other neural net training
# scripts, extracts the training examples used to train the neural net (and
# also the validation examples used for diagnostics), and puts them in
# separate archives.

# Begin configuration section.
cmd=run.pl
# Each archive has data chunks of length randomly chosen between
# $min_frames_per_chunk and $max_frames_per_chunk.
min_frames_per_chunk=50
max_frames_per_chunk=300
frames_per_iter=10000000 # target number of frames per archive.

frames_per_iter_diagnostic=100000 # have this many frames per archive for
                                  # the archives used for diagnostics.

num_diagnostic_archives=3 # we want to test the training likelihoods on a
                          # range of utterance lengths, and this number
                          # controls how many archives we evaluate on.

compress=true # set this to false to disable compression (e.g. if you want
              # to see whether results are affected).

num_heldout_utts=100 # number of utterances held out for the validation set
                     # and for the training diagnostic subset

num_repeats=1 # number of times each speaker repeats per archive

stage=0
nj=6 # This should be set to the maximum number of jobs you are comfortable
     # running in parallel; you can increase it if your disk speed is greater
     # and you have more machines.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <data> <egs-dir>"
  echo " e.g.: $0 data/train exp/xvector_a/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed). default=6"
  echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --min-frames-per-chunk <#frames;50>              # The minimum number of frames per chunk that we dump"
  echo "  --max-frames-per-chunk <#frames;300>             # The maximum number of frames per chunk that we dump"
  echo "  --num-repeats <#repeats;1>                       # The (approximate) number of times the training"
  echo "                                                   # data is repeated in the egs"
  echo "  --frames-per-iter <#frames;10000000>             # Target number of frames per archive"
  echo "  --num-diagnostic-archives <#archives;3>          # Controls how many different versions of the train"
  echo "                                                   # and validation archives we create (e.g."
  echo "                                                   # train_subset.{1,2,3}.egs and valid.{1,2,3}.egs by"
  echo "                                                   # default); they contain different utterance lengths."
  echo "  --frames-per-iter-diagnostic <#frames;100000>    # Target number of frames for the diagnostic archives"
  echo "                                                   # {train_subset,valid}.*.egs"
  echo "  --stage <stage|0>                                # Used to resume a partially-completed run of this"
  echo "                                                   # script from somewhere in the middle."
  exit 1;
fi

data=$1
dir=$2

for f in $data/utt2num_frames $data/feats.scp ; do
  [ ! -f $f ] && echo "$0: expected file $f" && exit 1;
done
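
# For reference: $data/utt2num_frames has one line per utterance, e.g.
# "utt-0001 4213" (utterance-id and number of frames), and $data/feats.scp maps
# each utterance-id to its feature matrix, e.g. "utt-0001 /path/to/raw_mfcc.1.ark:41"
# (the utterance-id, path and offset here are purely illustrative).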

feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1

mkdir -p $dir/info $dir/temp
temp=$dir/temp

echo $feat_dim > $dir/info/feat_dim
echo '0' > $dir/info/left_context
# The examples have at least min_frames_per_chunk right context.
echo $min_frames_per_chunk > $dir/info/right_context
echo '1' > $dir/info/frames_per_eg
cp $data/utt2num_frames $dir/temp/utt2num_frames
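
# At this point $dir/info contains feat_dim, left_context, right_context and
# frames_per_eg; num_frames, num_archives and num_diagnostic_archives are
# written there further below, once they have been computed.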

if [ $stage -le 0 ]; then
  echo "$0: Preparing train and validation lists"
  # Pick a list of heldout utterances for validation egs
  awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/valid_uttlist || exit 1;
  # The remaining utterances are used for training egs
  utils/filter_scp.pl --exclude $temp/valid_uttlist $temp/utt2num_frames > $temp/utt2num_frames.train
  utils/filter_scp.pl $temp/valid_uttlist $temp/utt2num_frames > $temp/utt2num_frames.valid
  # Pick a subset of the training list for diagnostics
  awk '{print $1}' $temp/utt2num_frames.train | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/train_subset_uttlist || exit 1;
  utils/filter_scp.pl $temp/train_subset_uttlist <$temp/utt2num_frames.train > $temp/utt2num_frames.train_subset
  # Map each speaker to a zero-based integer label, then map each utterance
  # to its speaker's integer label.
  awk -v id=0 '{print $1, id++}' $data/spk2utt > $temp/spk2int
  utils/sym2int.pl -f 2 $temp/spk2int $data/utt2spk > $temp/utt2int
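  # For illustration (speaker and utterance names made up): if spk2utt contains
  # "spk-A utt-1 utt-2" and "spk-B utt-3", then spk2int will contain "spk-A 0"
  # and "spk-B 1", and utt2int will contain "utt-1 0", "utt-2 0" and "utt-3 1".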
  utils/filter_scp.pl $temp/utt2num_frames.train $temp/utt2int > $temp/utt2int.train
  utils/filter_scp.pl $temp/utt2num_frames.valid $temp/utt2int > $temp/utt2int.valid
  utils/filter_scp.pl $temp/utt2num_frames.train_subset $temp/utt2int > $temp/utt2int.train_subset
fi

num_pdfs=$(awk '{print $2}' $temp/utt2int | sort | uniq -c | wc -l)
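# num_pdfs is the number of distinct integer speaker labels in utt2int; e.g. if
# the labels run from 0 to 1210 (hypothetical), num_pdfs will be 1211.  It is
# passed to nnet3-xvector-get-egs as --num-pdfs below.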
# The script assumes you've prepared the features ahead of time.
feats="scp,s,cs:utils/filter_scp.pl $temp/ranges.JOB $data/feats.scp |"
train_subset_feats="scp,s,cs:utils/filter_scp.pl $temp/train_subset_ranges.1 $data/feats.scp |"
valid_feats="scp,s,cs:utils/filter_scp.pl $temp/valid_ranges.1 $data/feats.scp |"
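
# Note on the rspecifiers above: "scp,s,cs:" tells Kaldi that the script file is
# sorted and will be accessed in sorted order, and the trailing "|" means the
# filter_scp.pl command is run as a pipe whose output is read as the scp.  The
# literal string JOB in $feats is substituted with the actual job number via
# sed in stage 3 below.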

# First, for the training data, work out how many archives we will have.
num_train_frames=$(awk '{n += $2} END{print n}' <$temp/utt2num_frames.train)
num_train_subset_frames=$(awk '{n += $2} END{print n}' <$temp/utt2num_frames.train_subset)

echo $num_train_frames >$dir/info/num_frames
num_train_archives=$(( (num_train_frames * num_repeats) / frames_per_iter + 1 ))
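# For example (numbers purely illustrative): with 360 million training frames,
# num_repeats=1 and frames_per_iter=10000000, this gives 360000000/10000000 + 1
# = 37 archives.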
echo "$0: Producing $num_train_archives archives for training"
echo $num_train_archives > $dir/info/num_archives
echo $num_diagnostic_archives > $dir/info/num_diagnostic_archives

if [ $nj -gt $num_train_archives ]; then
  echo "$0: Reducing num-jobs $nj to number of training archives $num_train_archives"
  nj=$num_train_archives
fi

if [ $stage -le 1 ]; then
  if [ -e $dir/storage ]; then
    # Make soft links to storage directories, if distributing this way.  See
    # utils/create_split_dir.pl.
    echo "$0: creating data links"
    utils/create_data_link.pl $(for x in $(seq $num_train_archives); do echo $dir/egs.$x.ark; done)
    utils/create_data_link.pl $(for x in $(seq $num_train_archives); do echo $dir/egs_temp.$x.ark; done)
  fi
fi

if [ $stage -le 2 ]; then
  echo "$0: Allocating training examples"
  $cmd $dir/log/allocate_examples_train.log \
    sid/nnet3/xvector/allocate_egs.py \
      --num-repeats=$num_repeats \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --frames-per-iter=$frames_per_iter \
      --num-archives=$num_train_archives --num-jobs=$nj \
      --utt2len-filename=$dir/temp/utt2num_frames.train \
      --utt2int-filename=$dir/temp/utt2int.train --egs-dir=$dir || exit 1

  echo "$0: Allocating training subset examples"
  $cmd $dir/log/allocate_examples_train_subset.log \
    sid/nnet3/xvector/allocate_egs.py \
      --prefix train_subset \
      --num-repeats=1 \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --randomize-chunk-length false \
      --frames-per-iter=$frames_per_iter_diagnostic \
      --num-archives=$num_diagnostic_archives --num-jobs=1 \
      --utt2len-filename=$dir/temp/utt2num_frames.train_subset \
      --utt2int-filename=$dir/temp/utt2int.train_subset --egs-dir=$dir || exit 1

  echo "$0: Allocating validation examples"
  $cmd $dir/log/allocate_examples_valid.log \
    sid/nnet3/xvector/allocate_egs.py \
      --prefix valid \
      --num-repeats=1 \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --randomize-chunk-length false \
      --frames-per-iter=$frames_per_iter_diagnostic \
      --num-archives=$num_diagnostic_archives --num-jobs=1 \
      --utt2len-filename=$dir/temp/utt2num_frames.valid \
      --utt2int-filename=$dir/temp/utt2int.valid --egs-dir=$dir || exit 1
fi

# At this stage we'll have created the ranges files that define how many egs
# there are and where they come from.  If this is your first time running this
# script, you might decide to put an exit 1 command here, and inspect the
# contents of $dir/temp/ranges.* before proceeding to the next stage.
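# Each line of ranges.* describes one chunk to extract: it names the source
# utterance, which archive the chunk is written to, the chunk's start frame
# and length, and the integer speaker label (see sid/nnet3/xvector/allocate_egs.py
# for the exact field order).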
if [ $stage -le 3 ]; then
  echo "$0: Generating training examples on disk"
  rm $dir/.error 2>/dev/null
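  # Each outputs.$g file (written by allocate_egs.py) lists the temporary
  # archives that job $g should write; the awk command below turns entries like
  # "egs_temp.1.ark egs_temp.2.ark" (illustrative names) into the wspecifiers
  # "ark:egs_temp.1.ark ark:egs_temp.2.ark" expected by nnet3-xvector-get-egs.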
  for g in $(seq $nj); do
    outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/outputs.$g)
    $cmd $dir/log/train_create_examples.$g.log \
      nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/ranges.$g \
      "$(echo $feats | sed s/JOB/$g/g)" $outputs || touch $dir/.error &
  done
  train_subset_outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/train_subset_outputs.1)
  echo "$0: Generating training subset examples on disk"
  $cmd $dir/log/train_subset_create_examples.1.log \
    nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/train_subset_ranges.1 \
    "$train_subset_feats" $train_subset_outputs || touch $dir/.error &
  wait
  valid_outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/valid_outputs.1)
  echo "$0: Generating validation examples on disk"
  $cmd $dir/log/valid_create_examples.1.log \
    nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/valid_ranges.1 \
    "$valid_feats" $valid_outputs || touch $dir/.error &
  wait
  if [ -f $dir/.error ]; then
    echo "$0: Problem detected while dumping examples"
    exit 1
  fi
fi

if [ $stage -le 4 ]; then
  echo "$0: Shuffling order of archives on disk"
  $cmd --max-jobs-run $nj JOB=1:$num_train_archives $dir/log/shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/egs_temp.JOB.ark \
    ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp || exit 1;
  $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/train_subset_shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/train_subset_egs_temp.JOB.ark \
    ark,scp:$dir/train_diagnostic_egs.JOB.ark,$dir/train_diagnostic_egs.JOB.scp || exit 1;
  $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/valid_shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/valid_egs_temp.JOB.ark \
    ark,scp:$dir/valid_egs.JOB.ark,$dir/valid_egs.JOB.scp || exit 1;
fi
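
# After stage 4 the shuffled archives are egs.*.{ark,scp},
# train_diagnostic_egs.*.{ark,scp} and valid_egs.*.{ark,scp}; the unshuffled
# *_temp.*.ark archives are no longer needed and are cleaned up in stage 5.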

if [ $stage -le 5 ]; then
  for file in $(for x in $(seq $num_diagnostic_archives); do echo $dir/train_subset_egs_temp.$x.ark; done) \
              $(for x in $(seq $num_diagnostic_archives); do echo $dir/valid_egs_temp.$x.ark; done) \
              $(for x in $(seq $num_train_archives); do echo $dir/egs_temp.$x.ark; done); do
    [ -L $file ] && rm $(readlink -f $file)
    rm $file
  done
  rm -rf $dir/valid_diagnostic.scp $dir/train_diagnostic.scp
  for x in $(seq $num_diagnostic_archives); do
    cat $dir/train_diagnostic_egs.$x.scp >> $dir/train_diagnostic.scp
    cat $dir/valid_egs.$x.scp >> $dir/valid_diagnostic.scp
  done
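  # combine.scp is expected by the downstream nnet3 training scripts (used for
  # the final model-combination diagnostics); here we simply reuse the
  # train_diagnostic examples for it.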
  ln -sf train_diagnostic.scp $dir/combine.scp
fi

echo "$0: Finished preparing training examples"