[egs] Fixed some issues in the multilingual BABEL example scripts (#1850)
authorpegahgh <pegahgh@gmail.com>
Tue, 5 Sep 2017 21:24:25 +0000 (17:24 -0400)
committerDaniel Povey <dpovey@gmail.com>
Tue, 5 Sep 2017 21:24:25 +0000 (14:24 -0700)
egs/babel_multilang/s5/local/nnet3/run_common_langs.sh
egs/babel_multilang/s5/local/nnet3/run_decode_lang.sh [new file with mode: 0755]
egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh
egs/babel_multilang/s5/run-4-anydecode-langs.sh [deleted file]
egs/wsj/s5/steps/nnet3/decode.sh
egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh

index 41b091a61e57612f6cfea884af066a3f2dbd1c8c..63b7da82f604eb69cb494e4496df62958f60c178 100755 (executable)
@@ -35,7 +35,7 @@ fi
 
 if [ "$speed_perturb" == "true" ]; then
   if [ $stage -le 1 ]; then
-    #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment
+    #Although the nnet model will be trained by high resolution data, we still have to perturb the normal data to get the alignment
     # _sp stands for speed-perturbed
     for datadir in train; do
       if [ ! -d data/$lang/${datadir}_sp ]; then
diff --git a/egs/babel_multilang/s5/local/nnet3/run_decode_lang.sh b/egs/babel_multilang/s5/local/nnet3/run_decode_lang.sh
new file mode 100755 (executable)
index 0000000..bd80fe9
--- /dev/null
@@ -0,0 +1,221 @@
+#!/bin/bash
+
+# Copyright 2016 Pegah Ghahremani
+
+# This script is used for decoding multilingual model and it is called in
+# local/nnet3/run_tdnn_multilingual.sh script.
+# This script needs decoding data dir, which is prepared using
+# egs/babel/s5d scripts (i.e. run-4-anydecode.sh).
+# If --use-pitch is true, pitch feature is added to high-resolution MFCC features.
+# If --use-bnf option is true, the --bnf-nnet-dir option, nnet3 model for
+# bottleneck feature extraction, should be provided.
+
+set -e
+set -o pipefail
+
+
+dir=dev10h.pem
+kind=
+use_pitch=true
+use_pitch_ivector=false # If true, pitch feature is used in ivector extraction.
+use_ivector=false
+decode_stage=-1
+nnet3_affix=
+feat_suffix=
+ivector_suffix=
+iter=final
+nj=30
+
+# params for extracting bn features
+use_bnf=false # If true, bottleneck feature is extracted and appended to input
+              # for nnet3 model.
+bnf_nnet_dir=exp/nnet3/multi_bnf_sp # dir for bottleneck nnet3 model
+                                    # (used for bottleneck feature extraction)
+use_ivector_bnf=false # If true, ivector used in extracting bottleneck features.
+
+. conf/common_vars.sh || exit 1;
+
+. utils/parse_options.sh
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $(basename $0) --dir <dir-type> <lang> <multilingual-nnet3-dir>"
+  echo " e.g.: $(basename $0) --dir dev2h.pem ASM exp/nnet3/tdnn_multi_sp"
+  exit 1
+fi
+
+lang=$1
+nnet3_dir=$2
+
+langconf=conf/$lang/lang.conf
+
+if [ ! -f $langconf ]; then
+  echo "$0: Language configuration $langconf does not exist! Use the "
+  echo "configurations in ../../babel/s5d/conf/lang/$lang-* as a startup." && exit 1
+fi
+. $langconf || exit 1;
+[ -f local.conf ] && . local.conf;
+
+mfcc=mfcc/$lang
+data=data/$lang
+vector_suffix=_gb
+
+dataset_dir=$data/$dir
+dataset_id=$dir
+dataset_type=${dir%%.*}
+
+#By default, we want the script to accept how the dataset should be handled,
+#i.e. of  what kind is the dataset
+if [ -z ${kind} ] ; then
+  if [ "$dataset_type" == "dev2h" ] || [ "$dataset_type" == "dev10h" ]; then
+    dataset_kind=supervised
+  else
+    dataset_kind=unsupervised
+  fi
+else
+  dataset_kind=$kind
+fi
+
+dataset=$(basename $dataset_dir)
+mfccdir=mfcc_hires/$lang
+mfcc_affix=""
+hires_config="--mfcc-config conf/mfcc_hires.conf"
+nnet3_data_dir=${dataset_dir}_hires
+feat_suffix=_hires
+ivec_feat_suffix=_hires
+log_dir=exp/$lang/make_hires/$dataset
+
+if $use_pitch_ivector; then
+  ivec_feat_suffix=_hires_pitch
+fi
+
+if $use_pitch; then
+  mfcc_affix="_pitch_online"
+  hires_config="$hires_config --online-pitch-config conf/pitch.conf"
+  mfccdir=mfcc_hires_pitch/lang
+  nnet3_data_dir=${dataset_dir}_hires_pitch
+  feat_suffix="_hires_pitch"
+  log_dir=exp/$lang/make_hires_pitch/$dataset
+fi
+
+
+####################################################################
+##
+##  Feature extraction for decoding
+##
+####################################################################
+echo ---------------------------------------------------------------------
+echo "Preparing ${dataset_kind} data files in ${dataset_dir} on" `date`
+echo ---------------------------------------------------------------------
+if [ ! -f  $dataset_dir/.done ] ; then
+  if [ ! -f ${nnet3_data_dir}/.mfcc.done ]; then
+    echo ---------------------------------------------------------------------
+    echo "Preparing ${dataset_kind} MFCC features in  ${nnet3_data_dir} and corresponding "
+    echo "iVectors in exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset}${feat_suffix}${ivector_suffix} on" `date`
+    echo ---------------------------------------------------------------------
+    if [ ! -d ${nnet3_data_dir} ]; then
+      utils/copy_data_dir.sh $data/$dataset ${nnet3_data_dir}
+    fi
+
+    steps/make_mfcc${mfcc_affix}.sh --nj $nj $hires_config \
+        --cmd "$train_cmd" ${nnet3_data_dir} $log_dir $mfccdir;
+    steps/compute_cmvn_stats.sh ${nnet3_data_dir} $log_dir $mfccdir;
+    utils/fix_data_dir.sh ${nnet3_data_dir};
+    touch ${nnet3_data_dir}/.mfcc.done
+  fi
+  touch $dataset_dir/.done
+fi
+
+ivector_dir=exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset}${ivec_feat_suffix}${ivector_suffix}
+if $use_ivector && [ ! -f $ivector_dir/.ivector.done ];then
+  extractor=exp/multi/nnet3${nnet3_affix}/extractor
+  ivec_feat_suffix=$feat_suffix
+  if $use_pitch && ! $use_pitch_ivector; then
+    ivec_feat_suffix=_hires
+    featdir=${dataset_dir}${feat_suffix}
+    mfcc_only_dim=`feat-to-dim scp:$featdir/feats.scp - | awk '{print $1-3}'`
+    steps/select_feats.sh --cmd "$train_cmd" --nj $nj 0-$[$mfcc_only_dim-1] \
+      $featdir ${dataset_dir}${ivec_feat_suffix} || exit 1;
+    steps/compute_cmvn_stats.sh ${dataset_dir}${ivec_feat_suffix} || exit 1;
+  fi
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
+    ${dataset_dir}${ivec_feat_suffix} $extractor $ivector_dir || exit 1;
+  touch $ivector_dir/.ivector.done
+fi
+
+if $use_bnf; then
+  multi_ivector_dir=exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset}${ivec_feat_suffix}${ivector_suffix}
+
+  ivector_for_bnf_opt=
+  if $use_ivector_bnf;then ivector_for_bnf_opt="--ivector-dir $multi_ivector_dir"; fi
+
+  bnf_data_dir=${dataset_dir}_bnf/$lang
+  if [ ! -f $bnf_data_dir/.done ]; then
+    steps/nnet3/make_bottleneck_features.sh --use-gpu true --nj 100 --cmd "$train_cmd" \
+      $ivector_for_bnf_opt tdnn_bn.renorm \
+      ${dataset_dir}${feat_suffix} $bnf_data_dir \
+      $bnf_nnet_dir bnf/$lang exp/$lang/make_${dataset}_bnf || exit 1;
+    touch $bnf_data_dir/.done
+  else
+    echo "$0: Skip Bottleneck feature extraction; You can force to run this step deleting $bnf_data_dir/.done."
+  fi
+
+  appended_bnf=${dataset_dir}${feat_suffix}_bnf
+  if [ ! -f $appended_bnf/.done ]; then
+    steps/append_feats.sh  --nj 16 --cmd "$train_cmd" \
+      $bnf_data_dir ${dataset_dir}${feat_suffix} \
+      ${dataset_dir}${feat_suffix}_bnf exp/$lang/append${feat_suffix}_bnf \
+      mfcc${feat_suffix}_bnf/$lang || exit 1;
+
+    steps/compute_cmvn_stats.sh $appended_bnf exp/$lang/make_cmvn${feat_suffix}_bnf \
+      mfcc${feat_suffix}_bnf/$lang || exit 1;
+    touch $appended_bnf/.done
+  fi
+  feat_suffix=${feat_suffix}_bnf
+fi
+
+####################################################################
+##
+## nnet3 model decoding
+##
+####################################################################
+if [ ! -f exp/$lang/tri5/graph/HCLG.fst ];then
+  utils/mkgraph.sh \
+    data/$lang/lang exp/$lang/tri5 exp/$lang/tri5/graph |tee exp/$lang/tri5/mkgraph.log
+fi
+
+if [ -f $nnet3_dir/$lang/final.mdl ]; then
+  decode=$nnet3_dir/$lang/decode_${dataset_id}
+  feat_suffix=_hires
+  ivec_feat_suffix=_hires
+
+  # suffix for using other features such as pitch
+  if $use_pitch; then
+    feat_suffix=${feat_suffix}_pitch
+  fi
+  if $use_pitch_ivector; then
+    ivec_feat_suffix=_hires_pitch
+  fi
+  if $use_bnf; then
+    feat_suffix=${feat_suffix}_bnf
+  fi
+  ivector_opts=
+  if $use_ivector; then
+    ivector_opts="--online-ivector-dir exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset_id}${ivec_feat_suffix}${ivector_suffix}"
+  fi
+  if [ ! -f $decode/.done ]; then
+    mkdir -p $decode
+    score_opts="--skip-scoring false"
+    [ ! -z $iter ] && iter_opt="--iter $iter"
+    steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $iter_opt \
+          --stage $decode_stage \
+          --beam $dnn_beam --lattice-beam $dnn_lat_beam \
+          $score_opts $ivector_opts \
+          exp/$lang/tri5/graph ${dataset_dir}${feat_suffix} $decode | tee $decode/decode.log
+
+    touch $decode/.done
+  fi
+fi
+
+echo "Everything looking good...."
+exit 0
index 5b05efb81ed2143706b7c5d31fad1c5ceb3540c0..65808822db39d2b0c6daea7355a2ae775824fa2a 100755 (executable)
@@ -304,11 +304,10 @@ if [ $stage -le 13 ]; then
   for lang_index in `seq 0 $[$num_decode_lang-1]`; do
     if [ ! -f $dir/${decode_lang_list[$lang_index]}/decode_dev10h.pem/.done ]; then
       echo "Decoding lang ${decode_lang_list[$lang_index]} using multilingual hybrid model $dir"
-      run-4-anydecode-langs.sh --use-ivector $use_ivector \
-        --use-pitch-ivector $use_pitch_ivector \
-        --nnet3-dir $dir --iter final_adj \
+      local/nnet3/run_decode_lang.sh --use-ivector $use_ivector \
+        --use-pitch-ivector $use_pitch_ivector --iter final_adj \
         --nnet3-affix "$nnet3_affix" \
-        ${decode_lang_list[$lang_index]} || exit 1;
+        ${decode_lang_list[$lang_index]} $dir || exit 1;
       touch $dir/${decode_lang_list[$lang_index]}/decode_dev10h.pem/.done
     fi
   done
diff --git a/egs/babel_multilang/s5/run-4-anydecode-langs.sh b/egs/babel_multilang/s5/run-4-anydecode-langs.sh
deleted file mode 100755 (executable)
index bd0db51..0000000
+++ /dev/null
@@ -1,456 +0,0 @@
-#!/bin/bash
-set -e
-set -o pipefail
-
-
-dir=dev10h.pem
-kind=
-data_only=false
-skip_kws=false
-skip_scoring=
-extra_kws=true
-vocab_kws=false
-tri5_only=false
-use_pitch=true
-use_pitch_ivector=false # if true, pitch feature used in ivector extraction.
-use_ivector=false
-use_bnf=false
-pitch_conf=conf/pitch.conf
-wip=0.5
-decode_stage=-1
-nnet3_affix=
-nnet3_dir=nnet3/tdnn_sp
-is_rnn=false
-extra_left_context=0
-extra_right_context=0
-frames_per_chunk=0
-feat_suffix=
-ivector_suffix=
-iter=final
-
-# params for extracting bn features
-multidir=exp/nnet3/multi_bnf_sp
-dump_bnf_dir=bnf
-bnf_layer=5
-
-
-. conf/common_vars.sh || exit 1;
-
-. utils/parse_options.sh
-
-if [ $# -ne 1 ]; then
-  echo "Usage: $(basename $0) --dir <dir-type> <lang>"
-  echo " e.g.: $(basename $0) --dir dev2h.pem ASM"
-  exit 1
-fi
-
-lang=$1
-
-
-langconf=conf/$lang/lang.conf
-
-[ ! -f $langconf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1
-. $langconf || exit 1;
-[ -f local.conf ] && . local.conf;
-
-mfcc=mfcc/$lang
-plp=plp/$lang
-data=data/$lang
-vector_suffix=_gb
-#This seems to be the only functioning way how to ensure the comple
-#set of scripts will exit when sourcing several of them together
-#Otherwise, the CTRL-C just terminates the deepest sourced script ?
-# Let shell functions inherit ERR trap.  Same as `set -E'.
-set -o errtrace
-trap "echo Exited!; exit;" SIGINT SIGTERM
-
-# Set proxy search parameters for the extended lexicon case.
-if [ -f $data/.extlex ]; then
-  proxy_phone_beam=$extlex_proxy_phone_beam
-  proxy_phone_nbest=$extlex_proxy_phone_nbest
-  proxy_beam=$extlex_proxy_beam
-  proxy_nbest=$extlex_proxy_nbest
-fi
-
-dataset_segments=${dir##*.}
-dataset_dir=$data/$dir
-dataset_id=$dir
-dataset_type=${dir%%.*}
-#By default, we want the script to accept how the dataset should be handled,
-#i.e. of  what kind is the dataset
-if [ -z ${kind} ] ; then
-  if [ "$dataset_type" == "dev2h" ] || [ "$dataset_type" == "dev10h" ]; then
-    dataset_kind=supervised
-  else
-    dataset_kind=unsupervised
-  fi
-else
-  dataset_kind=$kind
-fi
-
-dataset=$(basename $dataset_dir)
-mfccdir=mfcc_hires/$lang
-mfcc_affix=""
-hires_config="--mfcc-config conf/mfcc_hires.conf"
-data_dir=${dataset_dir}_hires
-feat_suffix=_hires
-ivec_feat_suffix=_hires
-log_dir=exp/$lang/make_hires/$dataset
-
-if $use_pitch_ivector; then
-  ivec_feat_suffix=_hires_pitch
-fi
-
-if $use_pitch; then
-  mfcc_affix="_pitch_online"
-  hires_config="$hires_config --online-pitch-config $pitch_conf"
-  mfccdir=mfcc_hires_pitch/lang
-  data_dir=${dataset_dir}_hires_pitch
-  feat_suffix="_hires_pitch"
-  log_dir=exp/$lang/make_hires_pitch/$dataset
-fi
-
-if [ -z $dataset_segments ]; then
-  echo "You have to specify the segmentation type as well"
-  echo "If you are trying to decode the PEM segmentation dir"
-  echo "such as data/dev10h, specify dev10h.pem"
-  echo "The valid segmentations types are:"
-  echo "\tpem   #PEM segmentation"
-  echo "\tuem   #UEM segmentation in the CMU database format"
-  echo "\tseg   #UEM segmentation (kaldi-native)"
-fi
-
-if [ -z "${skip_scoring}" ] ; then
-  if [ "$dataset_kind" == "unsupervised" ]; then
-    skip_scoring=true
-  else
-    skip_scoring=false
-  fi
-fi
-
-#The $dataset_type value will be the dataset name without any extrension
-eval my_data_dir=( "\${${dataset_type}_data_dir[@]}" )
-eval my_data_list=( "\${${dataset_type}_data_list[@]}" )
-if [ -z $my_data_dir ] || [ -z $my_data_list ] ; then
-  echo "Error: The dir you specified ($dataset_id) does not have existing config";
-  exit 1
-fi
-
-eval my_stm_file=\$${dataset_type}_stm_file
-eval my_ecf_file=\$${dataset_type}_ecf_file
-eval my_rttm_file=\$${dataset_type}_rttm_file
-eval my_nj=\$${dataset_type}_nj  #for shadow, this will be re-set when appropriate
-
-if [ -z "$my_nj" ]; then
-  echo >&2 "You didn't specify the number of jobs -- variable \"${dataset_type}_nj\" not defined."
-  exit 1
-fi
-
-my_subset_ecf=false
-eval ind=\${${dataset_type}_subset_ecf+x}
-if [ "$ind" == "x" ] ; then
-  eval my_subset_ecf=\$${dataset_type}_subset_ecf
-fi
-
-declare -A my_kwlists=()
-eval my_kwlist_keys="\${!${dataset_type}_kwlists[@]}"
-for key in $my_kwlist_keys  # make sure you include the quotes there
-do
-  eval my_kwlist_val="\${${dataset_type}_kwlists[$key]}"
-  my_kwlists["$key"]="${my_kwlist_val}"
-done
-
-#Just a minor safety precaution to prevent using incorrect settings
-#The dataset_* variables should be used.
-set -e
-set -o pipefail
-set -u
-unset dir
-unset kind
-
-function make_plp {
-  target=$1
-  logdir=$2
-  output=$3
-  if $use_pitch; then
-    steps/make_plp_pitch.sh --cmd "$decode_cmd" --nj $my_nj $target $logdir $output
-  else
-    steps/make_plp.sh --cmd "$decode_cmd" --nj $my_nj $target $logdir $output
-  fi
-  utils/fix_data_dir.sh $target
-  steps/compute_cmvn_stats.sh $target $logdir $output
-  utils/fix_data_dir.sh $target
-}
-
-function check_variables_are_set {
-  for variable in $mandatory_variables ; do
-    if ! declare -p $variable ; then
-      echo "Mandatory variable ${variable/my/$dataset_type} is not set! "
-      echo "You should probably set the variable in the config file "
-      exit 1
-    else
-      declare -p $variable
-    fi
-  done
-
-  if [ ! -z ${optional_variables+x} ] ; then
-    for variable in $optional_variables ; do
-      eval my_variable=\$${variable}
-      echo "$variable=$my_variable"
-    done
-  fi
-}
-
-if [ ! -f $data/raw_${dataset_type}_data/.done ]; then
-  echo ---------------------------------------------------------------------
-  echo "Subsetting the ${dataset_type} set"
-  echo ---------------------------------------------------------------------
-
-  l1=${#my_data_dir[*]}
-  l2=${#my_data_list[*]}
-  if [ "$l1" -ne "$l2" ]; then
-    echo "Error, the number of source files lists is not the same as the number of source dirs!"
-    exit 1
-  fi
-
-  resource_string=""
-  if [ "$dataset_kind" == "unsupervised" ]; then
-    resource_string+=" --ignore-missing-txt true"
-  fi
-
-  for i in `seq 0 $(($l1 - 1))`; do
-    resource_string+=" ${my_data_dir[$i]} "
-    resource_string+=" ${my_data_list[$i]} "
-  done
-  local/make_corpus_subset.sh $resource_string ./$data/raw_${dataset_type}_data
-  touch $data/raw_${dataset_type}_data/.done
-fi
-my_data_dir=`utils/make_absolute.sh ./$data/raw_${dataset_type}_data`
-[ -f $my_data_dir/filelist.list ] && my_data_list=$my_data_dir/filelist.list
-nj_max=`cat $my_data_list | wc -l` || nj_max=`ls $my_data_dir/audio | wc -l`
-
-if [ "$nj_max" -lt "$my_nj" ] ; then
-  echo "Number of jobs ($my_nj) is too big!"
-  echo "The maximum reasonable number of jobs is $nj_max"
-  my_nj=$nj_max
-fi
-
-#####################################################################
-#
-# Audio data directory preparation
-#
-#####################################################################
-echo ---------------------------------------------------------------------
-echo "Preparing ${dataset_kind} data files in ${dataset_dir} on" `date`
-echo ---------------------------------------------------------------------
-if [ ! -f  $dataset_dir/.done ] ; then
-  if [ "$dataset_kind" == "supervised" ]; then
-    if [ "$dataset_segments" == "seg" ]; then
-      . ./local/datasets/supervised_seg.sh || exit 1
-    elif [ "$dataset_segments" == "uem" ]; then
-      . ./local/datasets/supervised_uem.sh || exit 1
-    elif [ "$dataset_segments" == "pem" ]; then
-      . ./local/datasets/supervised_pem.sh || exit 1
-    else
-      echo "Unknown type of the dataset: \"$dataset_segments\"!";
-      echo "Valid dataset types are: seg, uem, pem";
-      exit 1
-    fi
-  elif [ "$dataset_kind" == "unsupervised" ] ; then
-    if [ "$dataset_segments" == "seg" ] ; then
-      . ./local/datasets/unsupervised_seg.sh
-    elif [ "$dataset_segments" == "uem" ] ; then
-      . ./local/datasets/unsupervised_uem.sh
-    elif [ "$dataset_segments" == "pem" ] ; then
-      ##This combination does not really makes sense,
-      ##Because the PEM is that we get the segmentation
-      ##and because of the format of the segment files
-      ##the transcript as well
-      echo "ERROR: $dataset_segments combined with $dataset_type"
-      echo "does not really make any sense!"
-      exit 1
-      #. ./local/datasets/unsupervised_pem.sh
-    else
-      echo "Unknown type of the dataset: \"$dataset_segments\"!";
-      echo "Valid dataset types are: seg, uem, pem";
-      exit 1
-    fi
-  else
-    echo "Unknown kind of the dataset: \"$dataset_kind\"!";
-    echo "Valid dataset kinds are: supervised, unsupervised, shadow";
-    exit 1
-  fi
-
-  if [ ! -f ${dataset_dir}/.plp.done ]; then
-    echo ---------------------------------------------------------------------
-    echo "Preparing ${dataset_kind} parametrization files in ${dataset_dir} on" `date`
-    echo ---------------------------------------------------------------------
-    make_plp ${dataset_dir} exp/$lang/make_plp/${dataset_id} plp/$lang
-    touch ${dataset_dir}/.plp.done
-  fi
-
-
-  if [ ! -f ${data_dir}/.mfcc.done ]; then
-    echo ---------------------------------------------------------------------
-    echo "Preparing ${dataset_kind} MFCC features in  ${data_dir} and corresponding iVectors in exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset}${feat_suffix}${ivector_suffix} on" `date`
-    echo ---------------------------------------------------------------------
-    if [ ! -d ${data_dir} ]; then
-      utils/copy_data_dir.sh $data/$dataset ${data_dir}
-    fi
-
-
-    steps/make_mfcc${mfcc_affix}.sh --nj $my_nj $hires_config \
-        --cmd "$train_cmd" ${data_dir} $log_dir $mfccdir;
-    steps/compute_cmvn_stats.sh ${data_dir} $log_dir $mfccdir;
-    utils/fix_data_dir.sh ${data_dir};
-    touch ${data_dir}/.mfcc.done
-  fi
-  touch $dataset_dir/.done
-fi
-
-# extract ivector
-dataset=$(basename $dataset_dir)
-ivector_dir=exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset}${ivec_feat_suffix}${ivector_suffix}
-if $use_ivector && [ ! -f $ivector_dir/.ivector.done ];then
-  extractor=exp/multi/nnet3${nnet3_affix}/extractor
-  ivec_feat_suffix=$feat_suffix
-  if $use_pitch && ! $use_pitch_ivector; then
-    ivec_feat_suffix=_hires
-    featdir=${dataset_dir}${feat_suffix}
-    mfcc_only_dim=`feat-to-dim scp:$featdir/feats.scp - | awk '{print $1-3}'`
-    steps/select_feats.sh --cmd "$train_cmd" --nj $my_nj 0-$[$mfcc_only_dim-1] \
-      $featdir ${dataset_dir}${ivec_feat_suffix} || exit 1;
-    steps/compute_cmvn_stats.sh ${dataset_dir}${ivec_feat_suffix} || exit 1;
-  fi
-
-  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $my_nj \
-    ${dataset_dir}${ivec_feat_suffix} $extractor $ivector_dir || exit 1;
-  touch $ivector_dir/.ivector.done
-fi
-
-if $use_bnf; then
-  # put the archives in ${dump_bnf_dir}/.
-  dataset=$(basename $dataset_dir)
-  multi_ivector_dir=exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset}${ivec_feat_suffix}${ivector_suffix}
-  bnf_data_dir=${dataset_dir}_bnf/$lang
-  if [ ! -f $bnf_data_dir/.done ]; then
-  steps/nnet3/dump_bottleneck_features.sh --use-gpu true --nj 100 --cmd "$train_cmd" \
-    --ivector-dir $multi_ivector_dir \
-    --feat-type raw \
-    ${dataset_dir}${feat_suffix} $bnf_data_dir \
-    $multidir $dump_bnf_dir/$lang exp/$lang/make_${dataset}_bnf || exit 1;
-  touch $bnf_data_dir/.done
-  fi
-  appended_bnf=${dataset_dir}${feat_suffix}_bnf
-  if [ ! -f $appended_bnf/.done ]; then
-    steps/append_feats.sh  --nj 16 --cmd "$train_cmd" \
-      $bnf_data_dir ${dataset_dir}${feat_suffix} \
-      ${dataset_dir}${feat_suffix}_bnf exp/$lang/append${feat_suffix}_bnf \
-      mfcc${feat_suffix}_bnf/$lang || exit 1;
-
-    steps/compute_cmvn_stats.sh $appended_bnf exp/$lang/make_cmvn${feat_suffix}_bnf \
-      mfcc${feat_suffix}_bnf/$lang || exit 1;
-    touch $appended_bnf/.done
-  fi
-  feat_suffix=${feat_suffix}_bnf
-fi
-
-#####################################################################
-#
-# KWS data directory preparation
-#
-#####################################################################
-echo ---------------------------------------------------------------------
-echo "Preparing kws data files in ${dataset_dir} on" `date`
-echo ---------------------------------------------------------------------
-
-if ! $skip_kws ; then
-  if  $extra_kws ; then
-    L1_lex=data/local/lexiconp.txt
-    . ./local/datasets/extra_kws.sh || exit 1
-  fi
-  if  $vocab_kws ; then
-    . ./local/datasets/vocab_kws.sh || exit 1
-  fi
-fi
-if $data_only ; then
-  echo "Exiting, as data-only was requested..."
-  exit 0;
-fi
-
-####################################################################
-## FMLLR decoding
-##
-####################################################################
-decode=exp/$lang/tri5/decode_${dataset_id}
-if [ ! -f exp/$lang/tri5/graph/HCLG.fst ];then
-  utils/mkgraph.sh \
-    data/$lang/lang exp/$lang/tri5 exp/$lang/tri5/graph |tee exp/$lang/tri5/mkgraph.log
-fi
-if [ ! -f ${decode}/.done ]; then
-  echo ---------------------------------------------------------------------
-  echo "Spawning decoding with SAT models  on" `date`
-  echo ---------------------------------------------------------------------
-  utils/mkgraph.sh \
-    data/$lang/lang exp/$lang/tri5 exp/$lang/tri5/graph |tee exp/$lang/tri5/mkgraph.log
-
-  mkdir -p $decode
-  #By default, we do not care about the lattices for this step -- we just want the transforms
-  #Therefore, we will reduce the beam sizes, to reduce the decoding times
-  steps/decode_fmllr_extra.sh --skip-scoring true --beam 10 --lattice-beam 4\
-    --nj $my_nj --cmd "$decode_cmd" "${decode_extra_opts[@]}"\
-    exp/$lang/tri5/graph ${dataset_dir} ${decode} |tee ${decode}/decode.log
-  touch ${decode}/.done
-fi
-
-
-if $tri5_only; then
-  echo "--tri5-only is true. So exiting."
-  exit 0
-fi
-
-####################################################################
-##
-## nnet3 model decoding
-##
-####################################################################
-
-if [ -f $nnet3_dir/$lang/final.mdl ]; then
-  decode=$nnet3_dir/$lang/decode_${dataset_id}
-  rnn_opts=
-  feat_suffix=_hires
-  ivec_feat_suffix=_hires
-
-  # suffix for using other features such as pitch
-  if $use_pitch; then
-    feat_suffix=${feat_suffix}_pitch
-  fi
-  if $use_pitch_ivector; then
-    ivec_feat_suffix=_hires_pitch
-  fi
-  if $use_bnf; then
-    feat_suffix=${feat_suffix}_bnf
-  fi
-  ivector_opts=
-  if $use_ivector; then
-    ivector_opts="--online-ivector-dir exp/$lang/nnet3${nnet3_affix}/ivectors_${dataset_id}${ivec_feat_suffix}${ivector_suffix}"
-  fi
-  if [ "$is_rnn" == "true" ]; then
-    rnn_opts=" --extra-left-context $extra_left_context --extra-right-context $extra_right_context  --frames-per-chunk $frames_per_chunk "
-  fi
-  if [ ! -f $decode/.done ]; then
-    mkdir -p $decode
-    score_opts="--skip-scoring false"
-    [ ! -z $iter ] && iter_opt="--iter $iter"
-    steps/nnet3/decode.sh --nj $my_nj --cmd "$decode_cmd" $iter_opt $rnn_opts \
-          --stage $decode_stage \
-          --beam $dnn_beam --lattice-beam $dnn_lat_beam \
-          $score_opts $ivector_opts \
-          exp/$lang/tri5/graph ${dataset_dir}${feat_suffix} $decode | tee $decode/decode.log
-
-    touch $decode/.done
-  fi
-fi
-
-echo "Everything looking good...."
-exit 0
index d1c7d24d8292641a32d554be8b8c145dd2be948f..0484fd229a60c72336cbb89e631ccbc2de9e2168 100755 (executable)
@@ -87,8 +87,6 @@ echo $nj > $dir/num_jobs
 ## Set up features.
 echo "$0: feature type is raw"
 
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
-
 feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
 if [ ! -z "$transform_dir" ]; then
   echo "$0: using transforms from $transform_dir"
index 7773b38918c74c179980b99988ea4e12da761631..09f4263918a5ea68db36f5142468ceb43236936a 100755 (executable)
@@ -39,7 +39,7 @@ if [ $# -gt 4 ]; then
 else
   logdir=$bnf_data/log
 fi
-if [ $# -gt 5]; then
+if [ $# -gt 5 ]; then
   bnfdir=$6
 else
   bnfdir=$bnf_data/data
@@ -76,7 +76,7 @@ mkdir -p $logdir
 mkdir -p $bnf_data
 mkdir -p $bnfdir
 echo $nj > $nnetdir/num_jobs
-splice_opts=`cat $nnetdir/splice_opts 2>/dev/null`
+
 [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
 
 use_ivector=false