summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 5f15d25)
raw | patch | inline | side by side (parent: 5f15d25)
author | LvHang <hanglv@nwpu-aslp.org> | |
Wed, 11 Jan 2017 05:56:15 +0000 (00:56 -0500) | ||
committer | Daniel Povey <dpovey@gmail.com> | |
Wed, 11 Jan 2017 05:56:15 +0000 (21:56 -0800) |
103 files changed:
diff --git a/egs/aurora4/s5/local/run_sgmm.sh b/egs/aurora4/s5/local/run_sgmm.sh
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/bash
-
-# This script is invoked from ../run.sh
-# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
-
-. cmd.sh
-
-# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
-# training, but this shouldn't have much effect.
-
-(
- steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
- data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
-
- steps/train_ubm.sh --cmd "$train_cmd" \
- 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
-
- steps/train_sgmm.sh --cmd "$train_cmd" \
- 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \
- exp/ubm5b/final.ubm exp/sgmm5a || exit 1;
-
- (
- utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93
- ) &
-
- steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
- --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1;
- steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
- data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
- data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
- exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
- done
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
- --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
- exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
- done
-
-) &
-
-
-(
-# The next commands are the same thing on all the si284 data.
-
-# SGMM system on the si284 data [sgmm5b]
- steps/train_ubm.sh --cmd "$train_cmd" \
- 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
-
- steps/train_sgmm.sh --cmd "$train_cmd" \
- 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
- exp/ubm5b/final.ubm exp/sgmm5b || exit 1;
-
- (
- utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93
- steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
- exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92
-
- utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1;
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
- exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93
- steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
- exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92
- ) &
-
- steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
- --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284
-
- steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
- data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
- data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1
-
- for iter in 1 2 3 4; do
- for test in dev93 eval92; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \
- exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
-
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \
- exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
- done
- done
-) &
-
-
-
-# Train quinphone SGMM system.
-
-steps/train_sgmm.sh --cmd "$train_cmd" \
- --context-opts "--context-width=5 --central-position=2" \
- 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
- exp/ubm5b/final.ubm exp/sgmm5c || exit 1;
-
-# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93.
-steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93
-
index 3be49854038a7fa86e0b28a806dd1137ad64aa08..59b2fdad3c9b1d46c6b03e4e6c1d9e3b191ee558 100755 (executable)
utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
$MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1
-elif [ "$1" == "SGMM" ]; then
- utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
-
- steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \
- $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1;
-
- steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR\
- $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1;
-
fi
index 3be49854038a7fa86e0b28a806dd1137ad64aa08..59b2fdad3c9b1d46c6b03e4e6c1d9e3b191ee558 100755 (executable)
utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
$MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1
-elif [ "$1" == "SGMM" ]; then
- utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
-
- steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \
- $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1;
-
- steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR\
- $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1;
-
fi
index 3be49854038a7fa86e0b28a806dd1137ad64aa08..59b2fdad3c9b1d46c6b03e4e6c1d9e3b191ee558 100755 (executable)
utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
$MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1
-elif [ "$1" == "SGMM" ]; then
- utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1
-
- steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \
- $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1;
-
- steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR\
- $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1;
-
fi
diff --git a/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh b/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh
+++ /dev/null
@@ -1,359 +0,0 @@
-#!/bin/bash -u
-
-# Copyright 2012 Arnab Ghoshal
-# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This is Subspace Gaussian Mixture Model (SGMM) training--
-# see "The subspace Gaussian mixture model--A structured model for speech recognition"
-# by D. Povey et al, Computer Speech and Language, 2011.
-
-function error_exit () {
- echo -e "$@" >&2; exit 1;
-}
-
-function readint () {
- local retval=${1/#*=/}; # In case --switch=ARG format was used
-# retval=${retval#0*} # Strip any leading 0's
- [[ "$retval" =~ ^-?[0-9][0-9]*$ ]] \
- || error_exit "Argument \"$retval\" not an integer."
- echo $retval
-}
-
-function est_alimodel () {
-# If we have speaker vectors, we need an alignment model. This function gets
-# the Gaussian-level alignments with the speaker vectors but accumulates stats
-# without any speaker vectors; we re-estimate M, w, c and S to get a model
-# that's compatible with not having speaker vectors. Note that the transitions
-# are not updated since the decoding graph will be shared with the normal model.
- local lx=$1
- for L in $LANGUAGES; do
- wdir=$dir/$L
- local lspkdim=`sgmm-info $wdir/$lx.mdl | grep speaker | awk '{print $NF}'`
- if [ "$lspkdim" -le 0 ]; then
- echo "est_alimodel: No speaker space in model '$wdir/$lx.mdl'. Returning."
- return
- fi
- done
-
- local y=0;
- local lflags=MwcS # First time don't update v
- while [ $y -lt $numiters_alimdl ]; do
- [ $y -gt 0 ] && lflags=vMwcS
- echo "Pass $y of building alignment model, flags = '$lflags'"
- local lmulti_est_opts='' # model, acc, model-out, occs-out tuples
- for L in $LANGUAGES; do
- (
- data=data/$L/train
- lang=data/$L/lang
- wdir=$dir/$L
- local cur_alimdl=$wdir/tmp$y.alimdl
- [ $y -eq 0 ] && cur_alimdl=$wdir/$lx.mdl
- feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
- gselect_opt="--gselect=ark,s,cs:gunzip -c $wdir/TASK_ID.gselect.gz|"
- spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs"
-
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/acc_ali${lx}_$y.TASK_ID.log \
- $sjopts ali-to-post "ark:gunzip -c $wdir/TASK_ID.ali.gz|" ark:- \| \
- sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \
- --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk $wdir/$lx.mdl \
- "$feats" ark,s,cs:- ark:- \| \
- sgmm-acc-stats-gpost --update-flags=$lflags $cur_alimdl "$feats" \
- ark,s,cs:- $wdir/$y.TASK_ID.aliacc \
- || { touch $dir/err; \
- error_exit "$L; Align model iter $y: Error accumulating stats"; }
-
- # Summing accs is quite fast; run locally
- sgmm-sum-accs $wdir/sum.aliacc $wdir/$y.*.aliacc || \
- { touch $dir/err; \
- error_exit "$L; Align model iter $y: Error summing stats"; }
- )& # Accumulate in parallel for different languages
- wdir=$dir/$L
- local cur_alimdl=$wdir/tmp$y.alimdl
- [ $y -eq 0 ] && cur_alimdl=$wdir/$lx.mdl
- lmulti_est_opts="$lmulti_est_opts $cur_alimdl $wdir/sum.aliacc $wdir/tmp$[$y+1].alimdl $wdir/tmp$[$y+1].occs"
- done
- wait
-
- submit_jobs.sh "$qcmd" --log=$dir/log/update_ali.$y.log $sjopts \
- sgmm-est-multi --update-flags=$lflags --remove-speaker-space=true \
- $lmulti_est_opts \
- || error_exit "Error estimating alignment models on iter $y";
-
- rm -f $dir/??/$y.*.aliacc $dir/??/sum.aliacc || exit 1;
- [ $y -gt 0 ] && rm $dir/??/tmp$y.{alimdl,occs}
- y=$[$y+1]
- done
-
- for L in $LANGUAGES; do
- mv $dir/$L/tmp$y.alimdl $dir/$L/$lx.alimdl
- done
-}
-
-nj=4 # Default number of jobs
-stage=-5 # Default starting stage (start with tree building)
-qcmd="" # Options for the submit_jobs.sh script
-sjopts="" # Options for the submit_jobs.sh script
-LANGUAGES='GE PO SP SW' # Languages processed
-
-PROG=`basename $0`;
-usage="Usage: $PROG [options] <phone-dim> <spk-dim> <ubm> <out-dir>\n
-e.g.: $PROG 40 39 exp/ubm3c/final.ubm exp/sgmm3c\n\n
-Options:\n
- --help\t\tPrint this message and exit\n
- --lang STR\tList of languages to process (default = '$LANGUAGES')\n
- --num-jobs INT\tNumber of parallel jobs to run (default=$nj).\n
- --qcmd STR\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
- --sjopts STR\tOptions for the 'submit_jobs.sh' script\n
- --stage INT\tStarting stage (e.g. -4 for SGMM init; 2 for iter 2; default=$stage)\n
-";
-
-echo "$PROG $@"
-while [ $# -gt 0 ]; do
- case "${1# *}" in # ${1# *} strips any leading spaces from the arguments
- --help) echo -e $usage; exit 0 ;;
- --lang) LANGUAGES="$2"; shift 2 ;;
- --num-jobs)
- shift; nj=`readint $1`;
- [ $nj -lt 1 ] && error_exit "--num-jobs arg '$nj' not positive.";
- shift ;;
- --qcmd)
- shift; qcmd=" --qcmd=${1}"; shift ;;
- --sjopts)
- shift; sjopts="$1"; shift ;;
- --stage)
- shift; stage=`readint $1`; shift ;;
- -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
- *) break ;; # end of options: interpreted as num-leaves
- esac
-done
-
-if [ $# != 4 ]; then
- error_exit $usage;
-fi
-
-[ -f path.sh ] && . path.sh
-
-# This is SGMM with speaker vectors, on top of LDA+[something] features.
-# Any speaker-specific transforms are obtained from the alignment directory.
-# To be run from ..
-
-phndim=$1
-spkdim=$2
-ubm=$3
-dir=$4
-
-[ -f $ubm ] || error_exit "UBM file '$ubm' does not exist"
-mkdir -p $dir/log || error_exit "Cannot create '$dir/log'"
-
-# (1): Model initialization; training graph and initial alignment generation.
-for L in $LANGUAGES; do
-(
- data=data/$L/train
- lang=data/$L/lang
- alidir=exp/$L/tri2a_ali
- wdir=$dir/$L
- oov_sym=`cat $lang/oov.txt`
- mkdir -p $wdir/log || error_exit "Cannot create working directory '$wdir'"
-
- # Initialize the model (removed the --spk-space-dim option)
- if [ $stage -le -5 ]; then
- echo "$L: Initializing model"
- submit_jobs.sh "$qcmd" --log=$wdir/log/init_sgmm.log $sjopts \
- sgmm-init --phn-space-dim=$phndim $lang/topo $wdir/tree $ubm \
- $wdir/0.mdl || { touch $dir/err; error_exit "$L: SGMM init failed."; }
- fi
-
- # Make training graphs
- if [ $stage -le -4 ]; then
- echo "$L: Compiling training graphs"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/mkgraphs.TASK_ID.log \
- $sjopts compile-train-graphs $wdir/tree $wdir/0.mdl $lang/L.fst \
- "ark:sym2int.pl --map-oov '$oov_sym' --ignore-first-field $lang/words.txt < $data/split$nj/TASK_ID/text |" \
- "ark:|gzip -c >$wdir/TASK_ID.fsts.gz" \
- || { touch $dir/err; error_exit "$L: Error compiling training graphs"; }
- fi
-
- if [ $stage -le -3 ]; then
- echo "$L: Converting alignments"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/convert.TASK_ID.log \
- $sjopts convert-ali $alidir/final.mdl $wdir/0.mdl $wdir/tree \
- "ark:gunzip -c $alidir/TASK_ID.ali.gz|" \
- "ark:|gzip -c >$wdir/TASK_ID.ali.gz" \
- || { touch $dir/err; error_exit "$L: Convert alignment failed."; }
- fi
-
- if [ $stage -le -2 ]; then
- echo "$L: Computing cepstral mean and variance statistics"
- submit_jobs.sh "$qcmd" --njobs=$nj $sjopts --log=$wdir/log/cmvn.TASK_ID.log \
- compute-cmvn-stats --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \
- scp:$data/split$nj/TASK_ID/feats.scp ark:$wdir/TASK_ID.cmvn \
- || { touch $dir/err; error_exit "$L: Computing CMN/CVN stats failed."; }
- fi
-
- feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
-
- if [ $stage -le -1 ]; then
- echo "$L: Doing Gaussian selection"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/gselectTASK_ID.log \
- $sjopts sgmm-gselect $wdir/0.mdl "$feats" "ark,t:|gzip -c > $wdir/TASK_ID.gselect.gz" \
- || { touch $dir/err; error_exit "$L: Error doing Gaussian selection"; }
- fi
-)& # Run the language-specific initializations in parallel
-done
-wait
-[ -f $dir/err ] && { rm $dir/err; error_exit "Error initializing models."; }
-
-# Language independent constants
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
-numiters_alimdl=3 # Number of iterations for estimating alignment model.
-incsub_interval=8 # increase substates every 8 iterations
-# total substates after each such increment
-total_substates=( 5000 7000 9000 12000 16000 20000 25000 30000 35000 40000 )
-# For a given number of substates, iterate for $incsub_interval iterations
-numiters=$[(${#total_substates[@]}+1)*$incsub_interval]
-realign_interval=4 # realign every 4 iterations
-spkvec_start=8 # use speaker subspace *after* 8 iterations
-spkvec_interval=2 # reestimate the speaker vectors every 2 iterations
-randprune=0.1
-
-# Initially don't have speaker vectors, but change this after we estimate them.
-spkvecs_gen=0
-
-x=0
-while [ $x -lt $numiters ]; do
- if [ $x -eq 0 ]; then
- flags=v # On first iter, don't update M or N.
- elif [ $spkdim -gt 0 -a $[$x%2] -eq 0 -a $x -gt $spkvec_start ]; then
- # Update N on odd iterations after 1st spkvec iter, if we have spk-space.
- flags=NwSvct
- else # Else update M but not N.
- flags=MwSvct
- fi
-
- if [ $stage -le $x ]; then
- echo "Pass $x: update flags = '$flags' "
- multi_est_opts='' # Will contain model, acc, model-out, occs-out tuples
- for L in $LANGUAGES; do
- (
- data=data/$L/train
- lang=data/$L/lang
- wdir=$dir/$L
- feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
- gselect_opt="--gselect=ark,s,cs:gunzip -c $wdir/TASK_ID.gselect.gz|"
- if [ $spkdim -gt 0 -a $spkvecs_gen -eq 1 ]; then
- spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs"
- else
- spkvecs_opt=''
- fi
- silphonelist=`cat $lang/silphones.csl`
-# numsubstates=`cat $wdir/numleaves` # Initial #-substates.
-
- if [ $[$x%$realign_interval] -eq 0 -a $x -gt 0 ]; then
- echo "$L; iter $x: Aligning data"
- submit_jobs.sh "$qcmd" $sjopts --log=$wdir/log/align.$x.TASK_ID.log \
- --njobs=$nj sgmm-align-compiled $spkvecs_opt $scale_opts \
- "$gselect_opt" --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \
- --beam=8 --retry-beam=40 $wdir/$x.mdl \
- "ark:gunzip -c $wdir/TASK_ID.fsts.gz|" "$feats" \
- "ark:|gzip -c >$wdir/TASK_ID.ali.gz" || \
- { touch $dir/err; error_exit "$L, it $x: Error realigning data"; }
- fi
-
- if [ $spkdim -gt 0 -a $x -gt $spkvec_start \
- -a $[$x%$spkvec_interval] -eq 0 ]; then
- echo "$L; iter $x: Computing speaker vectors"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/spkvecs.$x.TASK_ID.log \
- $sjopts ali-to-post "ark:gunzip -c $wdir/TASK_ID.ali.gz|" ark:- \| \
- weight-silence-post 0.01 $silphonelist $wdir/$x.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \
- $spkvecs_opt "$gselect_opt" --rand-prune=$randprune $wdir/$x.mdl \
- "$feats" ark,s,cs:- ark:$wdir/tmpTASK_ID.vecs || \
- { touch $dir/err; error_exit "$L, it $x: Error computing spkvecs"; }
- for n in `seq 1 $nj`; do
- mv $wdir/tmp${n}.vecs $wdir/${n}.vecs;
- done
- spkvecs_gen=1
- fi
-
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/acc.$x.TASK_ID.log \
- $sjopts sgmm-acc-stats --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \
- --update-flags=$flags --rand-prune=$randprune $spkvecs_opt \
- "$gselect_opt" $wdir/$x.mdl "$feats" \
- "ark,s,cs:ali-to-post 'ark:gunzip -c $wdir/TASK_ID.ali.gz|' ark:-|" \
- $wdir/$x.TASK_ID.acc || \
- { touch $dir/err; error_exit "$L, it $x: Error accumulating stats"; }
-
- # Summing accs is quite fast; run locally
- sgmm-sum-accs $wdir/sum.acc $wdir/$x.*.acc || \
- { touch $dir/err; error_exit "$L, it $x: Error summing stats"; }
- ) & # Accumulate in parallel for different languages
- wdir=$dir/$L
- multi_est_opts="$multi_est_opts $wdir/$x.mdl $wdir/sum.acc $wdir/$[$x+1].mdl $wdir/$[$x+1].occs"
- done
- wait
- [ -f $dir/err ] && \
- { rm $dir/err; error_exit "Iter $x: Error in accumulation"; }
-
- add_dim_opts=''
- if [ $x -eq $spkvec_start ]; then
- add_dim_opts="--increase-spk-dim=$spkdim --increase-phn-dim=$phndim"
- elif [ $x -eq $[$spkvec_start*2] ]; then
- add_dim_opts="--increase-spk-dim=$spkdim --increase-phn-dim=$phndim"
- fi
- split_opts=''
- if [ $[$x%$incsub_interval] -eq 1 -a $x -gt 1 ]; then
- index=$[($x/$incsub_interval)-1]
- numsubstates=${total_substates[$index]}
- split_opts="--split-substates=$numsubstates"
- fi
-
- submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log $sjopts \
- sgmm-est-multi --update-flags=$flags $split_opts $add_dim_opts \
- $multi_est_opts || error_exit "Error in pass $x estimation."
-
- # If using speaker vectors, estimate alignment model without spkvecs
- if [ $[$x%$incsub_interval] -eq 0 -a $x -gt 0 ]; then
- chmod -w $dir/??/$x.mdl $dir/??/$x.occs # Preserve for scoring
- [ $spkdim -gt 0 ] && est_alimodel $x;
- else
- rm -f $dir/??/$x.mdl $dir/??/$x.occs
- fi
- rm -f $dir/??/$x.*.acc $dir/??/sum.acc
- fi # End of current stage
- x=$[$x+1];
-done
-
-for L in $LANGUAGES; do
- (
- wdir=$dir/$L
- rm -f $wdir/final.mdl $wdir/final.occs;
- chmod -w $wdir/$x.mdl $wdir/$x.occs # Preserve for scoring
- ln -s $wdir/$x.mdl $wdir/final.mdl;
- ln -s $wdir/$x.occs $wdir/final.occs;
- # If using speaker vectors, estimate alignment model without spkvecs
- [ $spkdim -gt 0 ] && est_alimodel $wdir/$x.mdl;
- rm -f $wdir/final.alimdl;
- ln -sf $wdir/$x.alimdl $wdir/final.alimdl;
-
- # Print out summary of the warning messages.
- for x in $wdir/log/*.log; do
- n=`grep WARNING $x | wc -l`;
- if [ $n -ne 0 ]; then echo "$n warnings in $x"; fi;
- done
- )
-done
-
-echo Done
diff --git a/egs/gp/s1/path.sh b/egs/gp/s1/path.sh
index a38149ac899b8449477e935979d661603e09c279..cee9bacbde96ba565a083082cb96fc012bc7e8d8 100644 (file)
--- a/egs/gp/s1/path.sh
+++ b/egs/gp/s1/path.sh
KALDISRC=$KALDIROOT/src
KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin
KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin
-KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/lm
+KALDIBIN=$KALDIBIN:$KALDISRC/sgmm2bin:$KALDISRC/lm
FSTBIN=$KALDIROOT/tools/openfst/bin
LMBIN=$KALDIROOT/tools/irstlm/bin
diff --git a/egs/gp/s1/steps/decode_sgmm_deltas.sh b/egs/gp/s1/steps/decode_sgmm_deltas.sh
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Arnab Ghoshal
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# Decoding script for SGMM using standard MFCC/PLP + delta + acceleration
-# features.
-
-# assumes you are using speaker vectors [for no vectors, see
-# decode_sgmm_novec_lda_etc.sh, if it exists already].
-# if this includes speaker-specific transforms, you have to provide an "old"
-# decoding directory where the transforms are located. The data decoded in
-# that directory must be split up in the same way as the current directory.
-
-function error_exit () {
- echo -e "$@" >&2; exit 1;
-}
-
-function file_exists () {
- [ -f $1 ] || error_exit "$PROG: no such file '$1'"
-}
-
-function readposint () { # Strictly speaking, reading non-negative integers
- local retval=${1/#*=/}; # In case --switch=ARG format was used
- [[ "$retval" =~ ^[0-9]*$ ]] \
- || error_exit "Argument \"$retval\" not a non-negative integer."
- echo $retval
-}
-
-beam=13.0
-nj=1 # Default total number of jobs
-jobid=0 # Default job number
-qcmd="" # Options for the submit_jobs.sh script
-sjopts="" # Options for the submit_jobs.sh script
-use_spkvecs='' # Not expecting a model with speaker vectors, by default.
-
-PROG=`basename $0`;
-usage="Usage: $PROG [options] <graph-dir> <data-dir> <decode-dir> [<transform-dir>]\n
-e.g.: $PROG -j 10 0 exp/sgmm3c/graph_tgpr data/test_dev93 exp/sgmm3c/decode_dev93_tgpr exp/tri2b/decode_dev93_tgpr\n\n
-Options:\n
- --help\t\tPrint this message and exit.\n
- --beam FLOAT\tDecoding beam (default=$beam).\n
- -j INT INT\tNumber of parallel jobs to run (default=$nj) and current jobid.\n
- --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
- --sjopts STRING\tOptions for the 'submit_jobs.sh' script.\n
- --with-spkvecs\tModel has speaker vectors; do 2-pass decoding.\n
-";
-
-while [ $# -gt 0 ]; do
- case "${1# *}" in # ${1# *} strips any leading spaces from the arguments
- --help) echo -e $usage; exit 0 ;;
- --beam) beam=$2; shift 2 ;;
- -j) nj=`readposint $2`; jobid=`readposint $3`; shift 3 ;;
- --qcmd) qcmd=" --qcmd=${2}"; shift 2 ;;
- --sjopts) sjopts="$2"; shift 2 ;;
- --with-spkvecs) use_spkvecs=1; shift ;;
- -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
- *) break ;; # end of options: interpreted as num-leaves
- esac
-done
-
-if [ $# -lt 3 -o $# -gt 4 ]; then
- error_exit $usage;
-fi
-
-[ -f path.sh ] && . path.sh
-
-graphdir=$1
-data=$2
-dir=$3
-transdir=$4
-acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
-
-srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
-
-mkdir -p $dir
-
-if [ $nj -gt 1 ]; then
- mydata=$data/split$nj/$jobid
-else
- mydata=$data
-fi
-
-requirements="$mydata/feats.scp $srcdir/final.mdl $graphdir/HCLG.fst"
-[ -z "$use_spkvecs" ] || requirements=$requirements" $srcdir/final.alimdl"
-for f in $requirements; do
- file_exists $f
-done
-
-if [ ! -z "$transdir" ]; then # "$transdir" nonempty..
- file_exists $transdir/$n.trans
-fi
-
-feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | add-deltas ark:- ark:- |"
-
-[ ! -z "$transdir" ] && feats="$feats transform-feats --utt2spk=ark:$mydata/utt2spk ark:$transdir/$jobid.trans ark:- ark:- |"
-
-
-# Do Gaussian selection, since we'll have two decoding passes and don't want to
-# redo this. Note: it doesn't make a difference if we use final.mdl or
-# final.alimdl, they have the same UBM.
-sgmm-gselect $srcdir/final.mdl "$feats" "ark:|gzip -c >$dir/$jobid.gselect.gz" \
- 2>$dir/gselect$jobid.log \
- || error_exit "Error in Gaussian selection.";
-gselect_opt="--gselect=ark:gunzip -c $dir/$jobid.gselect.gz|"
-
-target_lat="$dir/lat.$jobid.gz"
-[ -z "$use_spkvecs" ] || target_lat="$dir/pre_lat.$jobid.gz"
-align_model="$srcdir/final.mdl"
-[ -z "$use_spkvecs" ] || align_model="$srcdir/final.alimdl"
-
-# Generate a state-level lattice for rescoring, with the alignment model and no
-# speaker vectors.
-
-sgmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \
- --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
- --word-symbol-table=$graphdir/words.txt "$gselect_opt" $align_model \
- $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $target_lat" \
- 2> $dir/decode_pass1.$jobid.log \
- || error_exit "Error in 1st-pass decoding.";
-
-# Do a second pass "decoding" if using speaker vectors.
-if [ ! -z "$use_spkvecs" ]; then
- silphonelist=`cat $graphdir/silphones.csl` || exit 1
- ( lattice-determinize --acoustic-scale=$acwt --prune=true --beam=4.0 \
- "ark:gunzip -c $dir/pre_lat.$jobid.gz|" ark:- \
- | lattice-to-post --acoustic-scale=$acwt ark:- ark:- \
- | weight-silence-post 0.0 $silphonelist $srcdir/final.alimdl ark:- ark:- \
- | sgmm-post-to-gpost "$gselect_opt" $srcdir/final.alimdl "$feats" ark:- \
- ark:- \
- | sgmm-est-spkvecs-gpost --spk2utt=ark:$mydata/spk2utt $srcdir/final.mdl \
- "$feats" ark:- "ark:$dir/$jobid.vecs"
- ) 2> $dir/vecs.$jobid.log \
- || error_exit "Error estimating speaker vectors.";
-
- # Now rescore the state-level lattices with the adapted features and the
- # corresponding model. Prune and determinize the lattices to limit their size.
-
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$mydata/utt2spk \
- --spk-vecs=ark:$dir/$jobid.vecs $srcdir/final.mdl \
- "ark:gunzip -c $dir/pre_lat.$jobid.gz|" "$feats" \
- "ark:|lattice-determinize --acoustic-scale=$acwt --prune=true --beam=6.0 ark:- ark:- | gzip -c > $dir/lat.$jobid.gz" \
- 2>$dir/rescore.$jobid.log \
- || error_exit "Error in 2nd-pass rescoring.";
-
- rm $dir/pre_lat.$jobid.gz
- # The top-level decoding script rescores "lat.$jobid.gz" to get final output.
-fi
-
diff --git a/egs/gp/s1/steps/train_sgmm_deltas.sh b/egs/gp/s1/steps/train_sgmm_deltas.sh
+++ /dev/null
@@ -1,270 +0,0 @@
-#!/bin/bash
-
-# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This is Subspace Gaussian Mixture Model (SGMM) training--
-# see "The subspace Gaussian mixture model--A structured model for speech recognition"
-# by D. Povey et al, Computer Speech and Language, 2011.
-
-function error_exit () {
- echo -e "$@" >&2; exit 1;
-}
-
-function readint () {
- local retval=${1/#*=/}; # In case --switch=ARG format was used
- retval=${retval#0*} # Strip any leading 0's
- [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \
- || error_exit "Argument \"$retval\" not an integer."
- echo $retval
-}
-
-nj=4 # Default number of jobs
-stage=-4 # Default starting stage (start with tree building)
-qcmd="" # Options for the submit_jobs.sh script
-sjopts="" # Options for the submit_jobs.sh script
-
-PROG=`basename $0`;
-usage="Usage: $PROG [options] <num-substates> <phone-dim> <spk-dim> <data-dir> <lang-dir> <ali-dir> <ubm>\n
-e.g.: $PROG 10000 40 39 data/train data/lang exp/tri2a_ali exp/ubm3c/final.ubm exp/sgmm3c\n\n
-Options:\n
- --help\t\tPrint this message and exit\n
- --num-jobs INT\tNumber of parallel jobs to run (default=$nj).\n
- --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n
- --sjopts STRING\tOptions for the 'submit_jobs.sh' script\n
- --stage INT\tStarting stage (e.g. -4 for SGMM init; 2 for iter 2; default=$stage)\n
-";
-
-while [ $# -gt 0 ]; do
- case "${1# *}" in # ${1# *} strips any leading spaces from the arguments
- --help) echo -e $usage; exit 0 ;;
- --num-jobs)
- shift; nj=`readint $1`;
- [ $nj -lt 1 ] && error_exit "--num-jobs arg '$nj' not positive.";
- shift ;;
- --qcmd)
- shift; qcmd=" --qcmd=${1}"; shift ;;
- --sjopts)
- shift; sjopts="$1"; shift ;;
- --stage)
- shift; stage=`readint $1`; shift ;;
- -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;;
- *) break ;; # end of options: interpreted as num-leaves
- esac
-done
-
-if [ $# != 8 ]; then
- error_exit $usage;
-fi
-
-[ -f path.sh ] && . path.sh
-
-# This is SGMM with speaker vectors, on top of LDA+[something] features.
-# Any speaker-specific transforms are obtained from the alignment directory.
-# To be run from ..
-
-totsubstates=$1
-phndim=$2
-spkdim=$3
-data=$4
-lang=$5
-alidir=$6
-ubm=$7
-dir=$8
-
-mkdir -p $dir || exit 1;
-
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
-
-numiters=25 # Total number of iterations
-numiters_alimdl=3 # Number of iterations for estimating alignment model.
-maxiterinc=15 # Last iter to increase #substates on.
-realign_iters="5 10 15";
-spkvec_iters="5 8 12 17"
-add_dim_iters="6 8 10 12"; # Iters on which to increase phn dim and/or spk dim,
- # if necessary, In most cases, either none of these or only the first of these
- # will have any effect (we increase in increments of [feature dim])
-
-oov_sym=`cat $lang/oov.txt`
-silphonelist=`cat $lang/silphones.csl`
-
-numsubstates=`cat $dir/numleaves` # Initial #-substates.
-# per-iter increment for #substates
-incsubstates=$[($totsubstates-$numsubstates)/$maxiterinc]
-
-# Initially don't have speaker vectors, but change this after we estimate them.
-spkvecs_opt=
-gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/TASK_ID.gselect.gz|"
-
-randprune=0.1
-mkdir -p $dir/log
-
-featspart="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$alidir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |"
-
-if [ ! -f $ubm ]; then
- echo "No UBM in $ubm"
- exit 1;
-fi
-
-if [ $stage -le -4 ]; then
- submit_jobs.sh "$qcmd" --log=$dir/log/init_sgmm.log $sjopts \
- sgmm-init --phn-space-dim=$phndim --spk-space-dim=$spkdim $lang/topo \
- $dir/tree $ubm $dir/0.mdl || error_exit "SGMM init failed."
-fi
-
-if [ $stage -le -3 ]; then
-# Make training graphs (this is split in $nj parts).
- echo "Compiling training graphs"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/compile_graphsTASK_ID.log \
- $sjopts compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \
- "ark:sym2int.pl --map-oov '$oov_sym' --ignore-first-field $lang/words.txt < $data/split$nj/TASK_ID/text |" \
- "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \
- || error_exit "Error compiling training graphs"
-fi
-
-if [ $stage -le -2 ]; then
- echo "Doing Gaussian selection"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/gselectTASK_ID.log \
- $sjopts sgmm-gselect $dir/0.mdl "$featspart" "ark,t:|gzip -c > $dir/TASK_ID.gselect.gz" \
- || error_exit "Error doing Gaussian selection"
-fi
-
-
-if [ $stage -le -1 ]; then
- echo "Converting alignments" # don't bother parallelizing; very fast.
- for n in `seq 1 $nj`; do
- convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree \
- "ark:gunzip -c $alidir/$n.ali.gz|" "ark:|gzip -c >$dir/$n.ali.gz" \
- 2>$dir/log/convert.$n.log
- done
-fi
-
-x=0
-while [ $x -lt $numiters ]; do
- if [ $x -eq 0 ]; then
- flags=vwcSt # On first iter, don't update M or N.
- elif [ $spkdim -gt 0 -a $[$x%2] -eq 1 -a \
- $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then
- # Update N on odd iterations after 1st spkvec iter, if we have spk-space.
- flags=vNwcSt
- else # Else update M but not N.
- flags=vMwcSt
- fi
-
- if [ $stage -le $x ]; then
- echo "Pass $x: update flags = '$flags' "
- if echo $realign_iters | grep -w $x >/dev/null; then
- echo "Aligning data"
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/align.$x.TASK_ID.log \
- $sjopts sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
- --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk --beam=8 --retry-beam=40 \
- $dir/$x.mdl "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \
- "ark:|gzip -c >$dir/TASK_ID.ali.gz" \
- || error_exit "Error realigning data on iter $x"
- fi
-
- if [ $spkdim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/spkvecs.$x.TASK_ID.log \
- $sjopts ali-to-post "ark:gunzip -c $dir/TASK_ID.ali.gz|" ark:- \| \
- weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \
- $spkvecs_opt "$gselect_opt" --rand-prune=$randprune $dir/$x.mdl \
- "$featspart" ark,s,cs:- ark:$dir/tmpTASK_ID.vecs \
- || error_exit "Error computing speaker vectors on iter $x"
- for n in `seq 1 $nj`; do
- mv $dir/tmp${n}.vecs $dir/${n}.vecs;
- done
- spkvecs_opt="--spk-vecs=ark:$dir/TASK_ID.vecs"
- fi
-
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/acc.$x.TASK_ID.log \
- $sjopts sgmm-acc-stats --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \
- --update-flags=$flags --rand-prune=$randprune $spkvecs_opt \
- "$gselect_opt" $dir/$x.mdl "$featspart" \
- "ark,s,cs:ali-to-post 'ark:gunzip -c $dir/TASK_ID.ali.gz|' ark:-|" \
- $dir/$x.TASK_ID.acc || error_exit "Error accumulating stats on iter $x"
-
- add_dim_opts=
- if echo $add_dim_iters | grep -w $x >/dev/null; then
- add_dim_opts="--increase-phn-dim=$phndim --increase-spk-dim=$spkdim"
- fi
-
- submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log $sjopts \
- sgmm-est --update-flags=$flags --split-substates=$numsubstates \
- $add_dim_opts --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
- "sgmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl \
- || error_exit "Error in pass $x estimation."
-
- rm -f $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
- fi
-
- if [ $x -lt $maxiterinc ]; then
- numsubstates=$[$numsubstates+$incsubstates]
- fi
- x=$[$x+1];
-done
-
-( cd $dir; rm final.mdl final.occs 2>/dev/null;
- ln -s $x.mdl final.mdl;
- ln -s $x.occs final.occs )
-
-if [ $spkdim -gt 0 ]; then
- # If we have speaker vectors, we need an alignment model.
- # The point of this last phase of accumulation is to get Gaussian-level
- # alignments with the speaker vectors but accumulate stats without
- # any speaker vectors; we re-estimate M, w, c and S to get a model
- # that's compatible with not having speaker vectors.
-
- # We do this for a few iters, in this recipe.
- cur_alimdl=$dir/$x.mdl
- y=0;
- while [ $y -lt $numiters_alimdl ]; do
- echo "Pass $y of building alignment model"
- if [ $y -eq 0 ]; then
- flags=MwcS # First time don't update v...
- else
- flags=vMwcS # don't update transitions-- will probably share graph with normal model.
- fi
-
- if [ $stage -le $[$y+100] ]; then
- submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/acc_ali.$y.TASK_ID.log \
- $sjopts ali-to-post "ark:gunzip -c $dir/TASK_ID.ali.gz|" ark:- \| \
- sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \
- --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk $dir/$x.mdl \
- "$featspart" ark,s,cs:- ark:- \| \
- sgmm-acc-stats-gpost --update-flags=$flags $cur_alimdl "$featspart" \
- ark,s,cs:- $dir/$y.TASK_ID.aliacc \
- || error_exit "Error accumulating stats for alignment model on iter $y"
-
- submit_jobs.sh "$qcmd" --log=$dir/log/update_ali.$y.log $sjopts \
- sgmm-est --update-flags=$flags --remove-speaker-space=true \
- $cur_alimdl "sgmm-sum-accs - $dir/$y.*.aliacc|" $dir/$[$y+1].alimdl \
- || error_exit "Error estimating alignment model on iter $y";
- rm $dir/$y.*.aliacc || exit 1;
- [ $y -gt 0 ] && rm $dir/$y.alimdl
- fi
- cur_alimdl=$dir/$[$y+1].alimdl
- y=$[$y+1]
- done
- (cd $dir; rm final.alimdl 2>/dev/null; ln -s $y.alimdl final.alimdl )
-fi
-
-# Print out summary of the warning messages.
-for x in $dir/log/*.log; do
- n=`grep WARNING $x | wc -l`;
- if [ $n -ne 0 ]; then echo $n warnings in $x; fi;
-done
-
-echo Done
diff --git a/egs/gp/s5/path.sh b/egs/gp/s5/path.sh
index af75fa50c1b4511fe8734ccc0c5cc7b403f3a4e0..e9f7a8337bca8d06d8f19231b0b24b08630a681f 100644 (file)
--- a/egs/gp/s5/path.sh
+++ b/egs/gp/s5/path.sh
KALDISRC=$KALDI_ROOT/src
KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin
KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin
-KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/lm
+KALDIBIN=$KALDIBIN:$KALDISRC/sgmm2bin:$KALDISRC/lm
FSTBIN=$KALDI_ROOT/tools/openfst/bin
LMBIN=$KALDI_ROOT/tools/irstlm/bin
diff --git a/egs/gp/s5/run.sh b/egs/gp/s5/run.sh
index e563bdff0d19b0ff3fea0918c6131e0a61edcefa..8054d02988d16fb665d1c92aecfb817569827efd 100755 (executable)
--- a/egs/gp/s5/run.sh
+++ b/egs/gp/s5/run.sh
num_states=$(grep "^$L" conf/sgmm.conf | cut -f2)
num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3)
mkdir -p exp/$L/sgmm2a
- steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \
+ steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \
$num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \
exp/$L/ubm2a/final.ubm exp/$L/sgmm2a >& exp/$L/sgmm2a/train.log
mkdir -p exp/$L/sgmm2b
- steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 \
+ steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 \
$num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali \
exp/$L/ubm2a/final.ubm exp/$L/sgmm2b >& exp/$L/sgmm2b/train.log
) &
$highmem_cmd $graph_dir/mkgraph.log \
utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/$sgmm $graph_dir
- steps/decode_sgmm.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \
+ steps/decode_sgmm2.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \
exp/$L/$sgmm/decode_dev_${lm_suffix}
) &
done # loop over LMs
diff --git a/egs/lre07/v2/path.sh b/egs/lre07/v2/path.sh
index 7cf73af8c53d9ee583758f059704e6ad4dc3fca2..d55f970d1fb7a459c4bfba877fed2d8b2e304bec 100755 (executable)
--- a/egs/lre07/v2/path.sh
+++ b/egs/lre07/v2/path.sh
export KALDI_ROOT=$(cd ../../..; pwd)
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
+export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
export LC_ALL=C
index ed17b628f479de289e36a3c1a3f20db6303d9818..7ff2bd975e1aac1442109c3b605a74eda3dc10af 100755 (executable)
# local/run_raw_fmllr.sh
# You don't have to run all 3 of the below, e.g. you can just run the run_sgmm2.sh
-#local/run_sgmm.sh
local/run_sgmm2.sh
#local/run_sgmm2x.sh
diff --git a/egs/rm/s5/local/run_sgmm.sh b/egs/rm/s5/local/run_sgmm.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-. cmd.sh
-
-## SGMM on top of LDA+MLLT+SAT features.
-if [ ! -f exp/ubm4a/final.mdl ]; then
- steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 400 data/train data/lang exp/tri3b_ali exp/ubm4a || exit 1;
-fi
-steps/train_sgmm.sh --cmd "$train_cmd" 2500 7500 data/train data/lang exp/tri3b_ali exp/ubm4a/final.ubm exp/sgmm4a || exit 1;
-
-utils/mkgraph.sh data/lang exp/sgmm4a exp/sgmm4a/graph || exit 1;
-
-steps/decode_sgmm.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
- --transform-dir exp/tri3b/decode exp/sgmm4a/graph data/test exp/sgmm4a/decode || exit 1;
-
-steps/decode_sgmm.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
- --transform-dir exp/tri3b/decode exp/sgmm4a/graph data/test exp/sgmm4a/decode_fmllr || exit 1;
-
- # Now we'll align the SGMM system to prepare for discriminative training.
- steps/align_sgmm.sh --nj 8 --cmd "$train_cmd" --transform-dir exp/tri3b \
- --use-graphs true --use-gselect true data/train data/lang exp/sgmm4a exp/sgmm4a_ali || exit 1;
- steps/make_denlats_sgmm.sh --nj 8 --sub-split 20 --cmd "$decode_cmd" --transform-dir exp/tri3b \
- data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri3b --boost 0.2 \
- data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats exp/sgmm4a_mmi_b0.2
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri3b/decode data/lang data/test exp/sgmm4a/decode exp/sgmm4a_mmi_b0.2/decode_it$iter &
- done
-
-wait
-steps/decode_combine.sh data/test data/lang exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode || exit 1;
-steps/decode_combine.sh data/test data/lang exp/sgmm4a/decode exp/tri3b_mmi/decode exp/combine_4a_3b/decode || exit 1;
-# combining the sgmm run and the best MMI+fMMI run.
-steps/decode_combine.sh data/test data/lang exp/sgmm4a/decode exp/tri3b_fmmi_c/decode_it5 exp/combine_4a_3b_fmmic5/decode || exit 1;
-
-steps/decode_combine.sh data/test data/lang exp/sgmm4a_mmi_b0.2/decode_it4 exp/tri3b_fmmi_c/decode_it5 exp/combine_4a_mmi_3b_fmmic5/decode || exit 1;
-
diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh
index 00bac326a803ee02a1cd9bce0384bdf65141261e..aa838ceda890f42139770af9520e2989c46ce113 100755 (executable)
--- a/egs/rm/s5/run.sh
+++ b/egs/rm/s5/run.sh
# local/run_raw_fmllr.sh
-# You don't have to run all 3 of the below, e.g. you can just run the run_sgmm2.sh
-#local/run_sgmm.sh
+# You don't have to run all 2 of the below, e.g. you can just run the run_sgmm2.sh
local/run_sgmm2.sh
#local/run_sgmm2x.sh
diff --git a/egs/sprakbanken/s5/local/run_sgmm.sh b/egs/sprakbanken/s5/local/run_sgmm.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-
-# This script is invoked from ../run.sh
-# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
-
-. cmd.sh
-
-# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
-# training, but this shouldn't have much effect.
-
-(
- steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
- data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
-
- steps/train_ubm.sh --cmd "$train_cmd" \
- 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
-
- steps/train_sgmm.sh --cmd "$train_cmd" \
- 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \
- exp/ubm5a/final.ubm exp/sgmm5a || exit 1;
-
- (
- utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93
- ) &
-
- steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
- --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1;
- steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
- data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
- data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
- exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
- done
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
- --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
- exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
- done
-
-) &
-
-
-(
-# The next commands are the same thing on all the si284 data.
-
-# SGMM system on the si284 data [sgmm5b]
- steps/train_ubm.sh --cmd "$train_cmd" \
- 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
-
- steps/train_sgmm.sh --cmd "$train_cmd" \
- 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
- exp/ubm5b/final.ubm exp/sgmm5b || exit 1;
-
- (
- utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93
- steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
- exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92
-
- utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1;
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
- exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93
- steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
- exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92
- ) &
-
- steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
- --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284
-
- steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
- data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
- data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1
-
- for iter in 1 2 3 4; do
- for test in dev93 eval92; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \
- exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
-
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \
- exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
- done
- done
-) &
-
-
-
-# Train quinphone SGMM system.
-
-steps/train_sgmm.sh --cmd "$train_cmd" \
- --context-opts "--context-width=5 --central-position=2" \
- 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
- exp/ubm5b/final.ubm exp/sgmm5c || exit 1;
-
-# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93.
-steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93
diff --git a/egs/swbd/s5/local/run_sgmm.sh b/egs/swbd/s5/local/run_sgmm.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-. cmd.sh
-
-
-# Build a SGMM system on just the 100k_nodup data, on top of LDA+MLLT+SAT.
-if [ ! -f exp/ubm5a/final.ubm ]; then
- steps/train_ubm.sh --cmd "$train_cmd" 700 data/train_100k_nodup data/lang \
- exp/tri4a_ali_100k_nodup exp/ubm5a || exit 1;
-fi
-steps/train_sgmm.sh --cmd "$train_cmd" \
- 4500 40000 data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup \
- exp/ubm5a/final.ubm exp/sgmm5a || exit 1;
-
-utils/mkgraph.sh data/lang_test exp/sgmm5a exp/sgmm5a/graph || exit 1;
-
-steps/decode_sgmm.sh --cmd "$decode_cmd" --config conf/decode.config \
- --nj 30 --transform-dir exp/tri4a/decode_eval2000 \
- exp/sgmm5a/graph data/eval2000 exp/sgmm5a/decode_eval2000
-
- # Now discriminatively train the SGMM system on 100k_nodup data.
-steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4a_ali_100k_nodup \
- --use-graphs true --use-gselect true data/train_100k_nodup data/lang exp/sgmm5a exp/sgmm5a_ali_100k_nodup
-
- # Took the beam down to 10 to get acceptable decoding speed.
-steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --beam 9.0 --lattice-beam 6 --cmd "$decode_cmd" \
- --transform-dir exp/tri4a_ali_100k_nodup \
- data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup
-
-steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4a_ali_100k_nodup --boost 0.1 \
- data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup exp/sgmm5a_mmi_b0.1
-
-for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4a/decode_eval2000 data/lang_test data/eval2000 exp/sgmm5a/decode_eval2000 \
- exp/sgmm5a_mmi_b0.1/decode_eval2000_it$iter &
-done
-
diff --git a/egs/swbd/s5/run.sh b/egs/swbd/s5/run.sh
index 7286938b2905a95daa0ee7c2dfb6ef54598d02ac..d61b818fe1b169e3b813d9597530dc6bdf1e1141 100755 (executable)
--- a/egs/swbd/s5/run.sh
+++ b/egs/swbd/s5/run.sh
-#local/run_sgmm.sh
local/run_sgmm2.sh
# Building a larger SAT system.
index 5778d017529ca1ace7091c9a5f375278bb53aa9c..8aff7e40c66c5a7a29d46c435b7982ab38e4d3ab 100755 (executable)
--- a/egs/swbd/s5/run_edin.sh
+++ b/egs/swbd/s5/run_edin.sh
# TODO(arnab): add SGMM and hybrid
-# local/run_sgmm.sh
+# local/run_sgmm2.sh
# # Recipe with DNN system on top of fMLLR features
# local/run_hybrid.sh
diff --git a/egs/swbd/s5b/local/run_sgmm.sh b/egs/swbd/s5b/local/run_sgmm.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-. cmd.sh
-
-
-# Build a SGMM system on just the 100k_nodup data, on top of LDA+MLLT+SAT.
-if [ ! -f exp/ubm5a/final.ubm ]; then
- steps/train_ubm.sh --cmd "$train_cmd" 700 data/train_100k_nodup data/lang \
- exp/tri4a_ali_100k_nodup exp/ubm5a || exit 1;
-fi
-steps/train_sgmm.sh --cmd "$train_cmd" \
- 4500 40000 data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup \
- exp/ubm5a/final.ubm exp/sgmm5a || exit 1;
-
-utils/mkgraph.sh data/lang_test exp/sgmm5a exp/sgmm5a/graph || exit 1;
-
-steps/decode_sgmm.sh --cmd "$decode_cmd" --config conf/decode.config \
- --nj 30 --transform-dir exp/tri4a/decode_eval2000 \
- exp/sgmm5a/graph data/eval2000 exp/sgmm5a/decode_eval2000
-
- # Now discriminatively train the SGMM system on 100k_nodup data.
-steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4a_ali_100k_nodup \
- --use-graphs true --use-gselect true data/train_100k_nodup data/lang exp/sgmm5a exp/sgmm5a_ali_100k_nodup
-
- # Took the beam down to 10 to get acceptable decoding speed.
-steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --beam 9.0 --lattice-beam 6 --cmd "$decode_cmd" \
- --transform-dir exp/tri4a_ali_100k_nodup \
- data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup
-
-steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4a_ali_100k_nodup --boost 0.1 \
- data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup exp/sgmm5a_mmi_b0.1
-
-for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4a/decode_eval2000 data/lang_test data/eval2000 exp/sgmm5a/decode_eval2000 \
- exp/sgmm5a_mmi_b0.1/decode_eval2000_it$iter &
-done
-
index e582fdc47e84853ee93a017a0d9650aa754c6570..f54d95d60a851a244aaee2a770d89e65c6666568 100755 (executable)
kaldisrc=`pwd`/../../../src
openfst=`pwd`/../../../tools/openfst/
-export PATH=$kaldisrc/bin:$kaldisrc/fgmmbin:$kaldisrc/gmmbin:$kaldisrc/nnetbin:$kaldisrc/sgmm2bin:$kaldisrc/featbin:$kaldisrc/fstbin:$kaldisrc/latbin:$kaldisrc/onlinebin:$kaldisrc/sgmmbin:$kaldisrc/onl-rec:$openfst/bin:"$PATH"
+export PATH=$kaldisrc/bin:$kaldisrc/fgmmbin:$kaldisrc/gmmbin:$kaldisrc/nnetbin:$kaldisrc/sgmm2bin:$kaldisrc/featbin:$kaldisrc/fstbin:$kaldisrc/latbin:$kaldisrc/onlinebin:$kaldisrc/onl-rec:$openfst/bin:"$PATH"
export LD_LIBRARY_PATH=$kaldisrc/onl-rec:$kaldisrc/pykaldi/kaldi:$openfst/lib:$openfst/lib/fst:$LD_LIBRARY_PATH
export PYTHONPATH=$kaldisrc/pykaldi:$kaldisrc/pykaldi/pyfst:$PYTHONPATH
diff --git a/egs/wsj/s5/local/run_sgmm.sh b/egs/wsj/s5/local/run_sgmm.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-
-# This script is invoked from ../run.sh
-# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity.
-
-. cmd.sh
-
-# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for
-# training, but this shouldn't have much effect.
-
-(
- steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
- data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1;
-
- steps/train_ubm.sh --cmd "$train_cmd" \
- 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1;
-
- steps/train_sgmm.sh --cmd "$train_cmd" \
- 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \
- exp/ubm5a/final.ubm exp/sgmm5a || exit 1;
-
- (
- utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93
- ) &
-
- steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \
- --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1;
- steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
- data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
- data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
- exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter &
- done
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
- --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9
-
- for iter in 1 2 3 4; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \
- exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter &
- done
-
-) &
-
-
-(
-# The next commands are the same thing on all the si284 data.
-
-# SGMM system on the si284 data [sgmm5b]
- steps/train_ubm.sh --cmd "$train_cmd" \
- 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1;
-
- steps/train_sgmm.sh --cmd "$train_cmd" \
- 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
- exp/ubm5b/final.ubm exp/sgmm5b || exit 1;
-
- (
- utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93
- steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \
- exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92
-
- utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1;
- steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \
- exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93
- steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \
- exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92
- ) &
-
- steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \
- --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284
-
- steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
- data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284
-
- steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
- data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1
-
- for iter in 1 2 3 4; do
- for test in dev93 eval92; do
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \
- exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
-
- steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \
- --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \
- exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
- done
- done
-) &
-
-
-
-# Train quinphone SGMM system.
-
-steps/train_sgmm.sh --cmd "$train_cmd" \
- --context-opts "--context-width=5 --central-position=2" \
- 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \
- exp/ubm5b/final.ubm exp/sgmm5c || exit 1;
-
-# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93.
-steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \
- data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93
diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh
index ca13c1704f28d8d86d88510f79cb4b6904f2771a..fb0041176584b46a20078ccb33a6e88861132ccb 100755 (executable)
--- a/egs/wsj/s5/run.sh
+++ b/egs/wsj/s5/run.sh
#local/run_nnet2.sh
-## Segregated some SGMM builds into a separate file.
-#local/run_sgmm.sh
-
# You probably want to run the sgmm2 recipe as it's generally a bit better:
local/run_sgmm2.sh
diff --git a/egs/wsj/s5/steps/align_sgmm.sh b/egs/wsj/s5/steps/align_sgmm.sh
+++ /dev/null
@@ -1,198 +0,0 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0
-
-# Computes training alignments and (if needed) speaker-vectors, given an
-# SGMM system. If the system is built on top of SAT, you should supply
-# transforms with the --transform-dir option.
-
-# If you supply the --use-graphs option, it will use the training
-# graphs from the source directory.
-
-# Begin configuration section.
-stage=0
-nj=4
-cmd=run.pl
-use_graphs=false # use graphs from srcdir
-use_gselect=false # use gselect info from srcdir [regardless, we use
- # Gaussian-selection info, we might have to compute it though.]
-gselect=15 # Number of Gaussian-selection indices for SGMMs.
-# Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
-beam=10
-retry_beam=40
-transform_dir= # directory to find fMLLR transforms in.
-# End configuration options.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f path.sh ] && . ./path.sh # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# != 4 ]; then
- echo "usage: steps/align_sgmm.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
- echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\"
- echo " exp/sgmm4a exp/sgmm5a_ali"
- echo "main options (for others, see top of script file)"
- echo " --config <config-file> # config containing options"
- echo " --nj <nj> # number of parallel jobs"
- echo " --use-graphs true # use graphs in src-dir"
- echo " --transform-dir <transform-dir> # directory to find fMLLR transforms"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- exit 1;
-fi
-
-data=$1
-lang=$2
-srcdir=$3
-dir=$4
-
-oov=`cat $lang/oov.int` || exit 1;
-silphonelist=`cat $lang/phones/silence.csl` || exit 1;
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
-cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
-sdata=$data/split$nj
-
-mkdir -p $dir/log
-cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options.
-cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
-echo $nj > $dir/num_jobs
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
-cp $lang/phones.txt $dir || exit 1;
-
-cp $srcdir/{tree,final.mdl} $dir || exit 1;
-[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir
-cp $srcdir/final.occs $dir;
-
-## Set up features.
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
- cp $srcdir/final.mat $dir
- ;;
- *) echo "Invalid feature type $feat_type" && exit 1;
-esac
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
- [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
- && echo "$0: #jobs mismatch with transform-dir." && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
- echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
- echo " but you are not providing the --transform-dir option during alignment."
-fi
-##
-
-## Set up model and alignment model.
-mdl=$srcdir/final.mdl
-if [ -f $srcdir/final.alimdl ]; then
- alimdl=$srcdir/final.alimdl
-else
- alimdl=$srcdir/final.mdl
-fi
-[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
-
-## Work out where we're getting the graphs from.
-if $use_graphs; then
- [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
- echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
- [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
- graphdir=$srcdir
- ln.pl $srcdir/fsts.*.gz $dir
-else
- graphdir=$dir
- if [ $stage -le 0 ]; then
- echo "$0: compiling training graphs"
- tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
- $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
- compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
- "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
- fi
-fi
-
-## Work out where we're getting the Gaussian-selection info from
-if $use_gselect; then
- [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
- echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1;
- [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1;
- graphdir=$srcdir
- gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|"
- ln.pl $srcdir/gselect.*.gz $dir
-else
- graphdir=$dir
- if [ $stage -le 1 ]; then
- echo "$0: computing Gaussian-selection info"
- # Note: doesn't matter whether we use $alimdl or $mdl, they will
- # have the same gselect info.
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect --full-gmm-nbest=$gselect $alimdl \
- "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
- fi
- gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
-fi
-
-
-if [ $alimdl == $mdl ]; then
- # Speaker-independent decoding-- just one pass. Not normal.
- T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1;
- [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1;
-
- if [ $stage -le 2 ]; then
- echo "$0: aligning data in $data using model $mdl (no speaker-vectors)"
- $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
- sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \
- "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
- fi
- echo "$0: done aligning data."
- exit 0;
-fi
-
-# Continue with system with speaker vectors.
-if [ $stage -le 2 ]; then
- echo "$0: aligning data in $data using model $alimdl"
- $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
- sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \
- "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
-fi
-
-if [ $stage -le 3 ]; then
- echo "$0: computing speaker vectors (1st pass)"
- $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \
- ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
- weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
- sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \
- sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
- $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1;
-fi
-
-if [ $stage -le 4 ]; then
- echo "$0: computing speaker vectors (2nd pass)"
- $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \
- ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
- weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
- --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1;
- rm $dir/pre_vecs.*
-fi
-
-if [ $stage -le 5 ]; then
- echo "$0: doing final alignment."
- $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
- sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \
- --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
- $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
-fi
-
-rm $dir/pre_ali.*.gz
-
-echo "$0: done aligning data."
-
-utils/summarize_warnings.pl $dir/log
-
-exit 0;
index 8f68a2f7a08823a55460f36df67dccff7d45a0ed..d2f829f7e3e87fa569237087e2b0821c40e0a61c 100755 (executable)
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
- echo "usage: steps/align_sgmm.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
- echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\"
+ echo "usage: steps/align_sgmm2.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
+ echo "e.g.: steps/align_sgmm2.sh --transform-dir exp/tri3b data/train data/lang \\"
echo " exp/sgmm4a exp/sgmm5a_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
diff --git a/egs/wsj/s5/steps/decode_sgmm.sh b/egs/wsj/s5/steps/decode_sgmm.sh
+++ /dev/null
@@ -1,266 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-
-# This script does decoding with an SGMM system, with speaker vectors.
-# If the SGMM system was
-# built on top of fMLLR transforms from a conventional system, you should
-# provide the --transform-dir option.
-
-# Begin configuration section.
-stage=1
-alignment_model=
-transform_dir= # dir to find fMLLR transforms.
-nj=4 # number of decoding jobs.
-acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
-cmd=run.pl
-beam=15.0
-gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
- # the first_pass_gselect variable is used for the 1st pass of
- # decoding and can be tighter.
-first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
- # the 1st pass of decoding (lattice generation).
-max_active=7000
-
-#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming
-# in the other scripts
-lattice_beam=6.0 # Beam we use in lattice generation.
-vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
- # speaker-vector computation. Can be quite tight (actually we could
- # probably just do best-path.
-use_fmllr=false
-fmllr_iters=10
-fmllr_min_count=1000
-skip_scoring=false
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# -ne 3 ]; then
- echo "Usage: steps/decode_sgmm.sh [options] <graph-dir> <data-dir> <decode-dir>"
- echo " e.g.: steps/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
- echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr"
- echo "main options (for others, see top of script file)"
- echo " --transform-dir <decoding-dir> # directory of previous decoding"
- echo " # where we can find transforms for SAT systems."
- echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
- echo " --config <config-file> # config containing options"
- echo " --nj <nj> # number of parallel jobs"
- echo " --cmd <cmd> # Command to run in parallel with"
- echo " --beam <beam> # Decoding beam; default 13.0"
- exit 1;
-fi
-
-graphdir=$1
-data=$2
-dir=$3
-srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
-
-for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-sdata=$data/split$nj;
-silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
-cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
-gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
-gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
-
-mkdir -p $dir/log
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-echo $nj > $dir/num_jobs
-
-
-## Set up features.
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
- [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
- && echo "$0: #jobs mismatch with transform-dir." && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
- echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
- echo " but you are not providing the --transform-dir option in test time."
-fi
-##
-
-## Calculate FMLLR pre-transforms if needed. We are doing this here since this
-## step is requried by models both with and without speaker vectors
-if $use_fmllr; then
- if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
- echo "$0: computing pre-transform for fMLLR computation."
- sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
- fi
-fi
-
-## Save Gaussian-selection info to disk.
-# Note: we can use final.mdl regardless of whether there is an alignment model--
-# they use the same UBM.
-if [ $stage -le 1 ]; then
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
- "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
-fi
-
-## Work out name of alignment model. ##
-if [ -z "$alignment_model" ]; then
- if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
- else alignment_model=$srcdir/final.mdl; fi
-fi
-[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
-
-# Generate state-level lattice which we can rescore. This is done with the
-# alignment model and no speaker-vectors.
-if [ $stage -le 2 ]; then
- if [ -f "$graphdir/num_pdfs" ]; then
- [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \
- { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; }
- fi
- $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
- sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
- --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
- --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \
- $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
-fi
-
-## Check if the model has speaker vectors
-spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`
-
-if [ $spkdim -gt 0 ]; then ### For models with speaker vectors:
-
-# Estimate speaker vectors (1st pass). Prune before determinizing
-# because determinization can take a while on un-pruned lattices.
-# Note: the sgmm-post-to-gpost stage is necessary because we have
-# a separate alignment-model and final model, otherwise we'd skip it
-# and use sgmm-est-spkvecs.
- if [ $stage -le 3 ]; then
- $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
- sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
- sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
- $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
- fi
-
-# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
-# at this point we need to rescore the lattice to get the correct posteriors.
- if [ $stage -le 4 ]; then
- $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --speedup=true --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
- $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
- fi
- rm $dir/pre_vecs.*
-
- if $use_fmllr; then
- # Estimate fMLLR transforms (note: these may be on top of any
- # fMLLR transforms estimated with the baseline GMM system.
- if [ $stage -le 5 ]; then # compute fMLLR transforms.
- echo "$0: computing fMLLR transforms."
- $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --speedup=true --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
- --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
- $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
- fi
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
- fi
-
-# Now rescore the state-level lattices with the adapted features and the
-# corresponding model. Prune and determinize the lattices to limit
-# their size.
- if [ $stage -le 6 ]; then
- $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
- $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- fi
- rm $dir/pre_lat.*.gz
-
-else ### For models without speaker vectors:
-
- if $use_fmllr; then
- # Estimate fMLLR transforms (note: these may be on top of any
- # fMLLR transforms estimated with the baseline GMM system.
- if [ $stage -le 5 ]; then # compute fMLLR transforms.
- echo "$0: computing fMLLR transforms."
- $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --speedup=true --utt2spk=ark:$sdata/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
- --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
- $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
- fi
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
- fi
-
-# Now rescore the state-level lattices with the adapted features and the
-# corresponding model. Prune and determinize the lattices to limit
-# their size.
- if [ $stage -le 6 ] && $use_fmllr; then
- $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \
- $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- rm $dir/pre_lat.*.gz
- else # If no adaptation needed, determinize the lattice.
- $cmd JOB=1:$nj $dir/log/determinize.JOB.log \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam \
- "ark:gunzip -c $dir/pre_lat.JOB.gz|" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- rm $dir/pre_lat.*.gz
- fi
-
-fi
-
-if [ $stage -le 7 ]; then
- steps/diagnostic/analyze_lats.sh --cmd "$cmd" $graphdir $dir
-fi
-
-if [ $stage -le 8 ]; then
- if ! $skip_scoring ; then
- [ ! -x local/score.sh ] && \
- echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
- echo "score best paths"
- local/score.sh --cmd "$cmd" $data $graphdir $dir ||
- { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
- #echo "score confidence and timing with sclite"
- #local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir
- fi
-fi
-echo "Decoding done."
-exit 0;
index 7a3a4f6bd48eef4d533e0e61c3b0372c3cf36584..c84e597192ea94916292c251802b903243b24e2f 100755 (executable)
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
- echo "Usage: steps/decode_sgmm_fromlats.sh [options] <data-dir> <lang-dir> <old-decode-dir> <decode-dir>"
+ echo "Usage: steps/decode_sgmm2_fromlats.sh [options] <data-dir> <lang-dir> <old-decode-dir> <decode-dir>"
echo ""
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
index a37a47350d7cbd0b552342950a3445432c54c18a..c258ad000675af560e71fb3c31e4e1c95d610efe 100755 (executable)
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
- echo "Usage: steps/decode_sgmm_rescore.sh [options] <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
- echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
+ echo "Usage: steps/decode_sgmm2_rescore.sh [options] <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
+ echo " e.g.: steps/decode_sgmm2_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr"
echo "main options (for others, see top of script file)"
echo " --transform-dir <decoding-dir> # directory of previous decoding"
diff --git a/egs/wsj/s5/steps/decode_sgmm_fromlats.sh b/egs/wsj/s5/steps/decode_sgmm_fromlats.sh
+++ /dev/null
@@ -1,277 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-
-# This script does decoding with an SGMM system, with speaker vectors.
-# If the SGMM system was
-# built on top of fMLLR transforms from a conventional system, you should
-# provide the --transform-dir option.
-# This script does not use a decoding graph, but instead you provide
-# a previous decoding directory with lattices in it. This script will only
-# make use of the word sequences in the lattices; it limits the decoding
-# to those sequences. You should also provide a "lang" directory from
-# which this script will use the G.fst and L.fst.
-
-# Begin configuration section.
-stage=1
-alignment_model=
-transform_dir= # dir to find fMLLR transforms.
-acwt=0.08333 # Just a default value, used for adaptation and beam-pruning..
-batch_size=75 # Limits memory blowup in compile-train-graphs-fsts
-cmd=run.pl
-beam=20.0
-gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
- # the first_pass_gselect variable is used for the 1st pass of
- # decoding and can be tighter.
-first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
- # the 1st pass of decoding (lattice generation).
-max_active=7000
-
-#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming
-# in the other scripts
-lattice_beam=8.0 # Beam we use in lattice generation.
-vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
- # speaker-vector computation. Can be quite tight (actually we could
- # probably just do best-path.
-use_fmllr=false
-fmllr_iters=10
-fmllr_min_count=1000
-scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
-skip_scoring=false
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# -ne 4 ]; then
- echo "Usage: steps/decode_sgmm_fromlats.sh [options] <data-dir> <lang-dir> <old-decode-dir> <decode-dir>"
- echo ""
- echo "main options (for others, see top of script file)"
- echo " --transform-dir <decoding-dir> # directory of previous decoding"
- echo " # where we can find transforms for SAT systems."
- echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
- echo " --config <config-file> # config containing options"
- echo " --cmd <cmd> # Command to run in parallel with"
- echo " --beam <beam> # Decoding beam; default 13.0"
- exit 1;
-fi
-
-data=$1
-lang=$2
-olddir=$3
-dir=$4
-srcdir=`dirname $dir`
-
-for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \
- $srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-nj=`cat $olddir/num_jobs` || exit 1;
-sdata=$data/split$nj;
-silphonelist=`cat $lang/phones/silence.csl` || exit 1
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
-cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
-gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
-gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
-
-mkdir -p $dir/log
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-echo $nj > $dir/num_jobs
-
-
-## Set up features
-
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then
- transform_dir=$olddir
-fi
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
- [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
- && echo "$0: #jobs mismatch with transform-dir." && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
- echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
- echo " but you are not providing the --transform-dir option in test time."
-fi
-
-## Calculate FMLLR pre-transforms if needed. We are doing this here since this
-## step is requried by models both with and without speaker vectors
-if $use_fmllr; then
- if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
- echo "$0: computing pre-transform for fMLLR computation."
- sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
- fi
-fi
-
-## Save Gaussian-selection info to disk.
-# Note: we can use final.mdl regardless of whether there is an alignment model--
-# they use the same UBM.
-if [ $stage -le 1 ]; then
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
- "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
-fi
-
-## Work out name of alignment model. ##
-if [ -z "$alignment_model" ]; then
- if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
- else alignment_model=$srcdir/final.mdl; fi
-fi
-[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
-
-# Generate state-level lattice which we can rescore. This is done with the
-# alignment model and no speaker-vectors.
-if [ $stage -le 2 ]; then
- $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
- lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \
- fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \
- fstdeterminizestar ark:- ark:- \| \
- compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \
- --batch-size=$batch_size $scale_opts \
- $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \
- sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
- --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
- --word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \
- "ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
-fi
-
-## Check if the model has speaker vectors
-spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`
-
-if [ $spkdim -gt 0 ]; then ### For models with speaker vectors:
-
-# Estimate speaker vectors (1st pass). Prune before determinizing
-# because determinization can take a while on un-pruned lattices.
-# Note: the sgmm-post-to-gpost stage is necessary because we have
-# a separate alignment-model and final model, otherwise we'd skip it
-# and use sgmm-est-spkvecs.
- if [ $stage -le 3 ]; then
- $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
- sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
- sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \
- $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
- fi
-
-# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
-# at this point we need to rescore the lattice to get the correct posteriors.
- if [ $stage -le 4 ]; then
- $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
- $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
- fi
- rm $dir/pre_vecs.*
-
- if $use_fmllr; then
- # Estimate fMLLR transforms (note: these may be on top of any
- # fMLLR transforms estimated with the baseline GMM system.
- if [ $stage -le 5 ]; then # compute fMLLR transforms.
- echo "$0: computing fMLLR transforms."
- $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
- --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
- $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
- fi
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
- fi
-
-# Now rescore the state-level lattices with the adapted features and the
-# corresponding model. Prune and determinize the lattices to limit
-# their size.
- if [ $stage -le 6 ]; then
- $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
- $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- fi
- rm $dir/pre_lat.*.gz
-
-else ### For models without speaker vectors:
-
- if $use_fmllr; then
- # Estimate fMLLR transforms (note: these may be on top of any
- # fMLLR transforms estimated with the baseline GMM system.
- if [ $stage -le 5 ]; then # compute fMLLR transforms.
- echo "$0: computing fMLLR transforms."
- $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \
- --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
- $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
- fi
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
- fi
-
-# Now rescore the state-level lattices with the adapted features and the
-# corresponding model. Prune and determinize the lattices to limit
-# their size.
- if [ $stage -le 6 ] && $use_fmllr; then
- $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \
- $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- rm $dir/pre_lat.*.gz
- else # Already done with decoding if no adaptation needed.
- for n in `seq 1 $nj`; do
- mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz
- done
- fi
-
-fi
-
-# The output of this script is the files "lat.*.gz"-- we'll rescore this at
-# different acoustic scales to get the final output.
-
-
-if [ $stage -le 7 ]; then
- if ! $skip_scoring ; then
- [ ! -x local/score.sh ] && \
- echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
- echo "score best paths"
- local/score.sh --cmd "$cmd" $data $lang $dir ||
- { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
- # echo "score confidence and timing with sclite"
- # local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $lang $dir
- fi
-fi
-echo "Decoding done."
-exit 0;
diff --git a/egs/wsj/s5/steps/decode_sgmm_rescore.sh b/egs/wsj/s5/steps/decode_sgmm_rescore.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-
-# This script does decoding with an SGMM system, by rescoring lattices
-# generated from a previous SGMM system. The directory with the lattices
-# is assumed to contain speaker vectors, if used. Basically it rescores
-# the lattices one final time, using the same setup as the final decoding
-# pass of the source dir. The assumption is that the model may have
-# been discriminatively trained.
-
-# If the system was built on top of fMLLR transforms from a conventional system,
-# you should provide the --transform-dir option.
-
-# Begin configuration section.
-transform_dir= # dir to find fMLLR transforms.
-cmd=run.pl
-iter=final
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# -ne 4 ]; then
- echo "Usage: steps/decode_sgmm_rescore.sh [options] <graph-dir|lang-dir> <data-dir> <old-decode-dir> <decode-dir>"
- echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
- echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr"
- echo "main options (for others, see top of script file)"
- echo " --transform-dir <decoding-dir> # directory of previous decoding"
- echo " # where we can find transforms for SAT systems."
- echo " --config <config-file> # config containing options"
- echo " --cmd <cmd> # Command to run in parallel with"
- echo " --iter <iter> # iteration of model to use (default: final)"
- exit 1;
-fi
-
-graphdir=$1
-data=$2
-olddir=$3
-dir=$4
-srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
-
-for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \
- $srcdir/$iter.mdl; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-nj=`cat $olddir/num_jobs` || exit 1;
-sdata=$data/split$nj;
-gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|"
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
-cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
-
-mkdir -p $dir/log
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-echo $nj > $dir/num_jobs
-
-if [ -f $olddir/vecs.1 ]; then
- echo "$0: using speaker vectors from $olddir"
- spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
-else
- echo "$0: no speaker vectors found."
- spkvecs_opt=
-fi
-
-
-## Set up features.
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
- [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
- && echo "$0: #jobs mismatch with transform-dir." && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
- echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
- echo " but you are not providing the --transform-dir option in test time."
-fi
-
-if [ -f $olddir/trans.1 ]; then
- echo "$0: using (in addition to any previous transforms) transforms from $olddir"
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |"
-fi
-##
-
-# Rescore the state-level lattices with the model provided. Just
-# one command in this script.
-echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl"
-$cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt \
- $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
-
-[ ! -x local/score.sh ] && \
- echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
-local/score.sh --cmd "$cmd" $data $graphdir $dir
-
-exit 0;
diff --git a/egs/wsj/s5/steps/make_denlats_sgmm.sh b/egs/wsj/s5/steps/make_denlats_sgmm.sh
+++ /dev/null
@@ -1,189 +0,0 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# 2014 Guoguo Chen
-
-# Create denominator lattices for MMI/MPE training, with SGMM models. If the
-# features have fMLLR transforms you have to supply the --transform-dir option.
-# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is
-# possibly a slight mismatch because the speaker vectors come from supervised
-# adaptation.
-
-# Begin configuration section.
-nj=4
-cmd=run.pl
-sub_split=1
-beam=13.0
-lattice_beam=7.0
-acwt=0.1
-max_active=5000
-transform_dir=
-max_mem=20000000 # This will stop the processes getting too large.
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# != 4 ]; then
- echo "Usage: steps/make_denlats_sgmm.sh [options] <data-dir> <lang-dir> <src-dir|alidir> <exp-dir>"
- echo " e.g.: steps/make_denlats_sgmm.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
- echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
- echo " plus transforms."
- echo ""
- echo "Main options (for others, see top of script file)"
- echo " --config <config-file> # config containing options"
- echo " --nj <nj> # number of parallel jobs"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- echo " --sub-split <n-split> # e.g. 40; use this for "
- echo " # large databases so your jobs will be smaller and"
- echo " # will (individually) finish reasonably soon."
- echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
- exit 1;
-fi
-
-data=$1
-lang=$2
-alidir=$3 # could also be $srcdir, but only if no vectors supplied.
-dir=$4
-
-sdata=$data/split$nj
-splice_opts=`cat $alidir/splice_opts 2>/dev/null`
-cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`
-mkdir -p $dir/log
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-echo $nj > $dir/num_jobs
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
-
-oov=`cat $lang/oov.int` || exit 1;
-
-mkdir -p $dir
-
-cp -RH $lang $dir/
-
-# Compute grammar FST which corresponds to unigram decoding graph.
-new_lang="$dir/"$(basename "$lang")
-echo "$0: Making unigram grammar FST in $new_lang"
-cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
- awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
- utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
- || exit 1;
-
-# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
-# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
-# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph.
-
-echo "$0: Compiling decoding graph in $dir/dengraph"
-if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
- echo "$0: Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
-else
- utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1;
-fi
-
-if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
- cp $alidir/final.mat $dir
- ;;
- *) echo "$0: Invalid feature type $feat_type" && exit 1;
-esac
-
-if [ ! -z "$transform_dir" ]; then # add transforms to features...
- echo "$0: using fMLLR transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
- [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
- && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
- [ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \
- echo "$0: LDA transforms differ between $alidir and $transform_dir"
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
-else
- echo "$0: Assuming you don't have a SAT system, since no --transform-dir option supplied "
-fi
-
-if [ -f $alidir/gselect.1.gz ]; then
- gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|"
-else
- echo "$0: no such file $alidir/gselect.1.gz" && exit 1;
-fi
-
-if [ -f $alidir/vecs.1 ]; then
- spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
-else
- if [ -f $alidir/final.alimdl ]; then
- echo "$0: You seem to have an SGMM system with speaker vectors,"
- echo "yet we can't find speaker vectors. Perhaps you supplied"
- echo "the model director instead of the alignment directory?"
- exit 1;
- fi
-fi
-
-# if this job is interrupted by the user, we want any background jobs to be
-# killed too.
-cleanup() {
- local pids=$(jobs -pr)
- [ -n "$pids" ] && kill $pids
-}
-trap "cleanup" INT QUIT TERM EXIT
-
-if [ $sub_split -eq 1 ]; then
- $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
- sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \
- --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
- --max-mem=$max_mem --max-active=$max_active \
- --word-symbol-table=$lang/words.txt $alidir/final.mdl \
- $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
-else
- # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
- # to have at most two jobs running at each time. The idea is that if we have
- # stragglers from one job, we can be processing another one at the same time.
- rm $dir/.error 2>/dev/null
-
- prev_pid=
- for n in `seq $[nj+1]`; do
- if [ $n -gt $nj ]; then
- this_pid=
- elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then
- echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)";
- this_pid=
- else
- sdata2=$data/split$nj/$n/split${sub_split}utt;
- split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
- mkdir -p $dir/log/$n
- mkdir -p $dir/part
- feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`
- spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
- gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
- $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
- sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \
- --beam=$beam --lattice-beam=$lattice_beam \
- --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
- --word-symbol-table=$lang/words.txt $alidir/final.mdl \
- $dir/dengraph/HCLG.fst "$feats_subset" \
- "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
- this_pid=$!
- fi
- if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices.
- wait $prev_pid
- [ -f $dir/.error ] && \
- echo "$0: error generating denominator lattices" && exit 1;
- rm $dir/.merge_error 2>/dev/null
- echo "$0: Merging archives for data subset $prev_n"
- for k in `seq $sub_split`; do
- gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
- done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
- [ -f $dir/.merge_error ] && \
- echo "$0: Merging lattices for subset $prev_n failed" && exit 1;
- rm $dir/lat.$prev_n.*.gz
- touch $dir/.done.$prev_n
- fi
- prev_n=$n
- prev_pid=$this_pid
- done
-fi
-
-
-echo "$0: done generating denominator lattices with SGMMs."
diff --git a/egs/wsj/s5/steps/tandem/align_sgmm.sh b/egs/wsj/s5/steps/tandem/align_sgmm.sh
+++ /dev/null
@@ -1,236 +0,0 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-# Korbinian Riedhammer
-# Apache 2.0
-
-# Computes training alignments and (if needed) speaker-vectors, given an
-# SGMM system. If the system is built on top of SAT, you should supply
-# transforms with the --transform-dir option.
-
-# If you supply the --use-graphs option, it will use the training
-# graphs from the source directory.
-
-# Begin configuration section.
-stage=0
-nj=4
-cmd=run.pl
-use_graphs=false # use graphs from srcdir
-use_gselect=false # use gselect info from srcdir [regardless, we use
- # Gaussian-selection info, we might have to compute it though.]
-gselect=15 # Number of Gaussian-selection indices for SGMMs.
-# Begin configuration.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
-beam=10
-retry_beam=40
-transform_dir= # directory to find fMLLR transforms in.
-# End configuration options.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f path.sh ] && . ./path.sh # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# != 5 ]; then
- echo "usage: steps/tandem/align_sgmm.sh <data1-dir> <data2-dir> <lang-dir> <src-dir> <align-dir>"
- echo "e.g.: steps/tandem/align_sgmm.sh --transform-dir exp/tri3b data1/train data1/lang \\"
- echo " exp/sgmm4a exp/sgmm5a_ali"
- echo "main options (for others, see top of script file)"
- echo " --config <config-file> # config containing options"
- echo " --nj <nj> # number of parallel jobs"
- echo " --use-graphs true # use graphs in src-dir"
- echo " --transform-dir <transform-dir> # directory to find fMLLR transforms"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- exit 1;
-fi
-
-data1=$1
-data2=$2
-lang=$3
-srcdir=$4
-dir=$5
-
-oov=`cat $lang/oov.int` || exit 1;
-silphonelist=`cat $lang/phones/silence.csl` || exit 1;
-
-mkdir -p $dir/log
-echo $nj > $dir/num_jobs
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
-cp $lang/phones.txt $dir || exit 1;
-
-## Set up features.
-
-sdata1=$data1/split$nj
-sdata2=$data2/split$nj
-[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
-[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;
-
-cp $srcdir/{tree,final.mdl} $dir || exit 1;
-[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir
-cp $srcdir/final.occs $dir;
-
-## Set up features.
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
-normft2=`cat $srcdir/normft2 2>/dev/null`
-
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-
-case $feat_type in
- delta)
- echo "$0: feature type is $feat_type"
- ;;
- lda)
- echo "$0: feature type is $feat_type"
- cp $srcdir/{lda,final}.mat $dir/ || exit 1;
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-
-# set up feature stream 1; this are usually spectral features, so we will add
-# deltas or splice them
-feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
-
-if [ "$feat_type" == "delta" ]; then
- feats1="$feats1 add-deltas ark:- ark:- |"
-elif [ "$feat_type" == "lda" ]; then
- feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
-fi
-
-# set up feature stream 2; this are usually bottleneck or posterior features,
-# which may be normalized if desired
-feats2="scp:$sdata2/JOB/feats.scp"
-
-if [ "$normft2" == "true" ]; then
- feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
-fi
-
-# assemble tandem features
-feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
-
-# add transformation, if applicable
-if [ "$feat_type" == "lda" ]; then
- feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
-fi
-
-# splicing/normalization options
-cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null
-
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
- [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
- && echo "$0: #jobs mismatch with transform-dir." && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
- echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
- echo " but you are not providing the --transform-dir option during alignment."
-fi
-##
-
-## Set up model and alignment model.
-mdl=$srcdir/final.mdl
-if [ -f $srcdir/final.alimdl ]; then
- alimdl=$srcdir/final.alimdl
-else
- alimdl=$srcdir/final.mdl
-fi
-[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1;
-
-## Work out where we're getting the graphs from.
-if $use_graphs; then
- [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
- echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1;
- [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1;
- graphdir=$srcdir
- ln.pl $srcdir/fsts.*.gz $dir
-else
- graphdir=$dir
- if [ $stage -le 0 ]; then
- echo "$0: compiling training graphs"
- tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata1/JOB/text|";
- $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
- compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \
- "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
- fi
-fi
-
-## Work out where we're getting the Gaussian-selection info from
-if $use_gselect; then
- [ "$nj" != "`cat $srcdir/num_jobs`" ] && \
- echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1;
- [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1;
- graphdir=$srcdir
- gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|"
- ln.pl $srcdir/gselect.*.gz $dir
-else
- graphdir=$dir
- if [ $stage -le 1 ]; then
- echo "$0: computing Gaussian-selection info"
- # Note: doesn't matter whether we use $alimdl or $mdl, they will
- # have the same gselect info.
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect --full-gmm-nbest=$gselect $alimdl \
- "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
- fi
- gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|"
-fi
-
-
-if [ $alimdl == $mdl ]; then
- # Speaker-independent decoding-- just one pass. Not normal.
- T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1;
- [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1;
-
- if [ $stage -le 2 ]; then
- echo "$0: aligning data in $data using model $mdl (no speaker-vectors)"
- $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
- sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \
- "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
- fi
- echo "$0: done aligning data."
- exit 0;
-fi
-
-# Continue with system with speaker vectors.
-if [ $stage -le 2 ]; then
- echo "$0: aligning data in $data using model $alimdl"
- $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \
- sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \
- "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1;
-fi
-
-if [ $stage -le 3 ]; then
- echo "$0: computing speaker vectors (1st pass)"
- $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \
- ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
- weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
- sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \
- sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata1/JOB/spk2utt \
- $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1;
-fi
-
-if [ $stage -le 4 ]; then
- echo "$0: computing speaker vectors (2nd pass)"
- $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \
- ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \
- weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" \
- --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1;
- rm $dir/pre_vecs.*
-fi
-
-if [ $stage -le 5 ]; then
- echo "$0: doing final alignment."
- $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \
- sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \
- --utt2spk=ark:$sdata1/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
- $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
-fi
-
-rm $dir/pre_ali.*.gz
-
-echo "$0: done aligning data."
-
-utils/summarize_warnings.pl $dir/log
-
-exit 0;
diff --git a/egs/wsj/s5/steps/tandem/decode_sgmm.sh b/egs/wsj/s5/steps/tandem/decode_sgmm.sh
+++ /dev/null
@@ -1,303 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# Korbinian Riedhammer
-
-# This script does decoding with an SGMM system, with speaker vectors.
-# If the SGMM system was
-# built on top of fMLLR transforms from a conventional system, you should
-# provide the --transform-dir option.
-
-# Begin configuration section.
-stage=1
-alignment_model=
-transform_dir= # dir to find fMLLR transforms.
-nj=4 # number of decoding jobs.
-acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
-cmd=run.pl
-beam=15.0
-gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
- # the first_pass_gselect variable is used for the 1st pass of
- # decoding and can be tighter.
-first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
- # the 1st pass of decoding (lattice generation).
-max_active=7000
-
-#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming
-# in the other scripts
-lattice_beam=8.0 # Beam we use in lattice generation.
-vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
- # speaker-vector computation. Can be quite tight (actually we could
- # probably just do best-path.
-use_fmllr=false
-fmllr_iters=10
-fmllr_min_count=1000
-skip_scoring=false
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# -ne 4 ]; then
- echo "Usage: steps/tandem/decode_sgmm.sh [options] <graph-dir> <data1-dir> <data2-dir> <decode-dir>"
- echo " e.g.: steps/tandem/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\"
- echo " exp/sgmm3a/graph_tgpr {mfcc,bottleneck}/data/test_dev93 exp/sgmm3a/decode_dev93_tgpr"
- echo "main options (for others, see top of script file)"
- echo " --transform-dir <decoding-dir> # directory of previous decoding"
- echo " # where we can find transforms for SAT systems."
- echo " --alignment-model <ali-mdl> # Model for the first-pass decoding."
- echo " --config <config-file> # config containing options"
- echo " --nj <nj> # number of parallel jobs"
- echo " --cmd <cmd> # Command to run in parallel with"
- echo " --beam <beam> # Decoding beam; default 13.0"
- exit 1;
-fi
-
-graphdir=$1
-data1=$2
-data2=$3
-dir=$4
-srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
-
-for f in $graphdir/HCLG.fst $data1/feats.scp $data2/feats.scp $srcdir/final.mdl; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-silphonelist=`cat $graphdir/phones/silence.csl` || exit 1
-gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|"
-gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |"
-
-mkdir -p $dir/log
-echo $nj > $dir/num_jobs
-
-sdata1=$data1/split$nj;
-sdata2=$data2/split$nj;
-[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
-[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;
-
-
-## Set up features.
-
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options.
-normft2=`cat $srcdir/normft2 2>/dev/null`
-
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-
-case $feat_type in
- delta)
- echo "$0: feature type is $feat_type"
- ;;
- lda)
- echo "$0: feature type is $feat_type"
- cp $srcdir/{lda,final}.mat $dir/
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-
-# set up feature stream 1; this are usually spectral features, so we will add
-# deltas or splice them
-feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
-
-if [ "$feat_type" == "delta" ]; then
- feats1="$feats1 add-deltas ark:- ark:- |"
-elif [ "$feat_type" == "lda" ]; then
- feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
-fi
-
-# set up feature stream 2; this are usually bottleneck or posterior features,
-# which may be normalized if desired
-feats2="scp:$sdata2/JOB/feats.scp"
-
-if [ "$normft2" == "true" ]; then
- echo "Using cmvn for feats2"
- feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
-fi
-
-# assemble tandem features
-feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
-
-# add transformation, if applicable
-if [ "$feat_type" == "lda" ]; then
- feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
-fi
-
-# splicing/normalization options
-cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null
-
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1;
- [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \
- && echo "$0: #jobs mismatch with transform-dir." && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then
- echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms,"
- echo " but you are not providing the --transform-dir option in test time."
-fi
-##
-
-
-## Calculate FMLLR pre-transforms if needed. We are doing this here since this
-## step is requried by models both with and without speaker vectors
-if $use_fmllr; then
- if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then
- echo "$0: computing pre-transform for fMLLR computation."
- sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1;
- fi
-fi
-
-## Save Gaussian-selection info to disk.
-# Note: we can use final.mdl regardless of whether there is an alignment model--
-# they use the same UBM.
-if [ $stage -le 1 ]; then
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \
- "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
-fi
-
-## Work out name of alignment model. ##
-if [ -z "$alignment_model" ]; then
- if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl;
- else alignment_model=$srcdir/final.mdl; fi
-fi
-[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1;
-
-# Generate state-level lattice which we can rescore. This is done with the
-# alignment model and no speaker-vectors.
-if [ $stage -le 2 ]; then
- $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \
- sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \
- --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \
- --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \
- $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1;
-fi
-
-## Check if the model has speaker vectors
-spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'`
-
-if [ $spkdim -gt 0 ]; then ### For models with speaker vectors:
-
-# Estimate speaker vectors (1st pass). Prune before determinizing
-# because determinization can take a while on un-pruned lattices.
-# Note: the sgmm-post-to-gpost stage is necessary because we have
-# a separate alignment-model and final model, otherwise we'd skip it
-# and use sgmm-est-spkvecs.
- if [ $stage -le 3 ]; then
- $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \
- sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \
- sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata1/JOB/spk2utt \
- $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1;
- fi
-
-# Estimate speaker vectors (2nd pass). Since we already have spk vectors,
-# at this point we need to rescore the lattice to get the correct posteriors.
- if [ $stage -le 4 ]; then
- $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \
- $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1;
- fi
- rm $dir/pre_vecs.*
-
- if $use_fmllr; then
- # Estimate fMLLR transforms (note: these may be on top of any
- # fMLLR transforms estimated with the baseline GMM system.
- if [ $stage -le 5 ]; then # compute fMLLR transforms.
- echo "$0: computing fMLLR transforms."
- $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-fmllr --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \
- --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
- $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
- fi
- feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
- fi
-
-# Now rescore the state-level lattices with the adapted features and the
-# corresponding model. Prune and determinize the lattices to limit
-# their size.
- if [ $stage -le 6 ]; then
- $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata1/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \
- $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- fi
- rm $dir/pre_lat.*.gz
-
-else ### For models without speaker vectors:
-
- if $use_fmllr; then
- # Estimate fMLLR transforms (note: these may be on top of any
- # fMLLR transforms estimated with the baseline GMM system.
- if [ $stage -le 5 ]; then # compute fMLLR transforms.
- echo "$0: computing fMLLR transforms."
- $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \
- gunzip -c $dir/pre_lat.JOB.gz \| \
- sgmm-rescore-lattice --utt2spk=ark:$sdata1/JOB/utt2spk \
- "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \
- lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \
- sgmm-est-fmllr --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" \
- --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \
- $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1;
- fi
- feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
- fi
-
-# Now rescore the state-level lattices with the adapted features and the
-# corresponding model. Prune and determinize the lattices to limit
-# their size.
- if [ $stage -le 6 ] && $use_fmllr; then
- $cmd JOB=1:$nj $dir/log/rescore.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata1/JOB/utt2spk \
- $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \
- lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \
- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
- rm $dir/pre_lat.*.gz
- else # Already done with decoding if no adaptation needed.
- for n in `seq 1 $nj`; do
- mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz
- done
- fi
-
-fi
-
-# The output of this script is the files "lat.*.gz"-- we'll rescore this at
-# different acoustic scales to get the final output.
-
-
-if [ $stage -le 7 ]; then
- if ! $skip_scoring ; then
- [ ! -x local/score.sh ] && \
- echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
- echo "score best paths"
- local/score.sh --cmd "$cmd" $data $graphdir $dir ||
- { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; }
- # echo "score confidence and timing with sclite"
- # local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir
- fi
-fi
-echo "Decoding done."
-exit 0;
diff --git a/egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh b/egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# Korbinian Riedhammer
-
-# Create denominator lattices for MMI/MPE training, with SGMM models. If the
-# features have fMLLR transforms you have to supply the --transform-dir option.
-# It gets any speaker vectors from the "alignment dir" ($srcdir). Note: this is
-# possibly a slight mismatch because the speaker vectors come from supervised
-# adaptation.
-
-# Begin configuration section.
-nj=4
-cmd=run.pl
-sub_split=1
-beam=13.0
-lattice_beam=7.0
-acwt=0.1
-max_active=5000
-transform_dir=
-max_mem=20000000 # This will stop the processes getting too large.
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# != 5 ]; then
- echo "Usage: steps/tandem/make_denlats_sgmm.sh [options] <data1-dir> <data2-dir> <lang-dir> <src-dir|srcdir> <exp-dir>"
- echo " e.g.: steps/tandem/make_denlats_sgmm.sh {mfcc,bottleneck}/data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats"
- echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
- echo " plus transforms."
- echo ""
- echo "Main options (for others, see top of script file)"
- echo " --config <config-file> # config containing options"
- echo " --nj <nj> # number of parallel jobs"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- echo " --sub-split <n-split> # e.g. 40; use this for "
- echo " # large databases so your jobs will be smaller and"
- echo " # will (individually) finish reasonably soon."
- echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
- exit 1;
-fi
-
-data1=$1
-data2=$2
-lang=$3
-srcdir=$4 # could also be $srcdir, but only if no vectors supplied.
-dir=$5
-
-splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
-normft2=`cat $srcdir/normft2 2>/dev/null`
-mkdir -p $dir/log
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1;
-
-sdata1=$data1/split$nj
-sdata2=$data2/split$nj
-[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
-[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;
-
-echo $nj > $dir/num_jobs
-
-oov=`cat $lang/oov.int` || exit 1;
-
-mkdir -p $dir
-
-cp -r $lang $dir/
-
-# Compute grammar FST which corresponds to unigram decoding graph.
-
-cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
- awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
- utils/make_unigram_grammar.pl | fstcompile > $dir/lang/G.fst \
- || exit 1;
-
-# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
-# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
-# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph.
-
-if [ -s $dir/dengraph/HCLG.fst ]; then
- echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
-else
- utils/mkgraph.sh $dir/lang $srcdir $dir/dengraph || exit 1;
-fi
-
-# Set up features
-if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-
-case $feat_type in
- delta)
- echo "$0: feature type is $feat_type"
- ;;
- lda)
- echo "$0: feature type is $feat_type"
- cp $srcdir/{lda,final}.mat $dir/ || exit 1;
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-
-# set up feature stream 1; this are usually spectral features, so we will add
-# deltas or splice them
-feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
-
-if [ "$feat_type" == "delta" ]; then
- feats1="$feats1 add-deltas ark:- ark:- |"
-elif [ "$feat_type" == "lda" ]; then
- feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
-fi
-
-# set up feature stream 2; this are usually bottleneck or posterior features,
-# which may be normalized if desired
-feats2="scp:$sdata2/JOB/feats.scp"
-
-if [ "$normft2" == "true" ]; then
- feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
-fi
-
-# assemble tandem features
-feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
-
-# add transformation, if applicable
-if [ "$feat_type" == "lda" ]; then
- feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
-fi
-
-# splicing/normalization options
-cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null
-
-
-if [ ! -z "$transform_dir" ]; then # add transforms to features...
- echo "$0: using fMLLR transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist."
- [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \
- && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1;
- [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
- echo "$0: LDA transforms differ between $srcdir and $transform_dir"
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |"
-else
- echo "Assuming you don't have a SAT system, since no --transform-dir option supplied "
-fi
-
-if [ -f $srcdir/gselect.1.gz ]; then
- gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|"
-else
- echo "$0: no such file $srcdir/gselect.1.gz" && exit 1;
-fi
-
-if [ -f $srcdir/vecs.1 ]; then
- spkvecs_opt="--spk-vecs=ark:$srcdir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
-else
- if [ -f $srcdir/final.alimdl ]; then
- echo "You seem to have an SGMM system with speaker vectors,"
- echo "yet we can't find speaker vectors. Perhaps you supplied"
- echo "the model director instead of the alignment directory?"
- exit 1;
- fi
-fi
-
-if [ $sub_split -eq 1 ]; then
- $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
- sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \
- --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
- --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
- $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
-else
- for n in `seq $nj`; do
- if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
- echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
- else
- ssdata1=$data1/split$nj/$n/split${sub_split}utt;
- split_data.sh --per-utt $sdata1/$n $sub_split || exit 1;
- ssdata2=$data2/split$nj/$n/split${sub_split}utt;
- split_data.sh --per-utt $sdata2/$n $sub_split || exit 1;
- mkdir -p $dir/log/$n
- mkdir -p $dir/part
- feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g`
- spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"`
- gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"`
- $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
- sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \
- --beam=$beam --lattice-beam=$lattice_beam \
- --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \
- --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
- $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
- echo Merging archives for data subset $n
- rm $dir/.error 2>/dev/null;
- for k in `seq $sub_split`; do
- gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
- done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
- [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
- rm $dir/lat.$n.*.gz
- touch $dir/.done.$n
- fi
- done
-fi
-
-
-echo "$0: done generating denominator lattices with SGMMs."
diff --git a/egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh b/egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# Korbinian Riedhammer
-
-# MMI training (or optionally boosted MMI, if you give the --boost option),
-# for SGMMs. 4 iterations (by default) of Extended Baum-Welch update.
-#
-# Begin configuration section.
-cmd=run.pl
-num_iters=4
-boost=0.0
-cancel=true # if true, cancel num and den counts on each frame.
-acwt=0.1
-stage=0
-
-update_opts=
-transform_dir=
-# End configuration section
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# -ne 6 ]; then
- echo "Usage: steps/tandem/train_mmi_sgmm.sh <data1> <data2> <lang> <ali> <denlats> <exp>"
- echo " e.g.: steps/tandem/train_mmi_sgmm.sh {mfcc,bottleneck}/data1/train_si84 data1/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
- echo "Main options (for others, see top of script file)"
- echo " --boost <boost-weight> # (e.g. 0.1), for boosted MMI. (default 0)"
- echo " --cancel (true|false) # cancel stats (true by default)"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- echo " --config <config-file> # config containing options"
- echo " --stage <stage> # stage to do partial re-run from."
- echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
- exit 1;
-fi
-
-data1=$1
-data2=$2
-lang=$3
-alidir=$4
-denlatdir=$5
-dir=$6
-mkdir -p $dir/log
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
-cp $lang/phones.txt $dir || exit 1;
-
-for f in $data1/feats.scp $data2/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-nj=`cat $alidir/num_jobs` || exit 1;
-[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
- echo "$alidir and $denlatdir have different num-jobs" && exit 1;
-
-mkdir -p $dir/log
-echo $nj > $dir/num_jobs
-
-cp $alidir/{final.mdl,tree} $dir
-silphonelist=`cat $lang/phones/silence.csl` || exit 1;
-
-# Set up features
-
-sdata1=$data1/split$nj
-sdata2=$data2/split$nj
-[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
-[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;
-
-splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
-normft2=`cat $alidir/normft2 2>/dev/null`
-
-if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-
-case $feat_type in
- delta)
- echo "$0: feature type is $feat_type"
- ;;
- lda)
- echo "$0: feature type is $feat_type"
- cp $alidir/{lda,final}.mat $dir/ || exit 1;
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-
-# set up feature stream 1; this are usually spectral features, so we will add
-# deltas or splice them
-feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
-
-if [ "$feat_type" == "delta" ]; then
- feats1="$feats1 add-deltas ark:- ark:- |"
-elif [ "$feat_type" == "lda" ]; then
- feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
-fi
-
-# set up feature stream 2; this are usually bottleneck or posterior features,
-# which may be normalized if desired
-feats2="scp:$sdata2/JOB/feats.scp"
-
-if [ "$normft2" == "true" ]; then
- feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
-fi
-
-# assemble tandem features
-feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
-
-# add transformation, if applicable
-if [ "$feat_type" == "lda" ]; then
- feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
-fi
-
-# splicing/normalization options
-cp $alidir/{splice_opts,normft2,tandem} $dir 2>/dev/null
-
-
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \
- && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-else
- echo "$0: no fMLLR transforms."
-fi
-
-if [ -f $alidir/vecs.1 ]; then
- echo "$0: using speaker vectors from $alidir"
- spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk"
-else
- echo "$0: no speaker vectors."
- spkvecs_opt=
-fi
-
-if [ -f $alidir/gselect.1.gz ]; then
- echo "$0: using Gaussian-selection info from $alidir"
- gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|"
-else
- echo "$0: error: no Gaussian-selection info found" && exit 1;
-fi
-
-lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
-if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
- lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
-fi
-
-
-cur_mdl=$alidir/final.mdl
-x=0
-while [ $x -lt $num_iters ]; do
- echo "Iteration $x of MMI training"
- # Note: the num and den states are accumulated at the same time, so we
- # can cancel them per frame.
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
- sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt $cur_mdl "$lats" "$feats" ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- sum-post --merge=$cancel --scale1=-1 \
- ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
- sgmm-acc-stats2 "$gselect_opt" $spkvecs_opt $cur_mdl "$feats" ark,s,cs:- \
- $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
-
- n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
- [ "$n" -ne $[$nj*2] ] && \
- echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
- $cmd $dir/log/den_acc_sum.$x.log \
- sgmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
- rm $dir/den_acc.$x.*.acc
- $cmd $dir/log/num_acc_sum.$x.log \
- sgmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
- rm $dir/num_acc.$x.*.acc
-
- $cmd $dir/log/update.$x.log \
- sgmm-est-ebw $update_opts $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
- fi
- cur_mdl=$dir/$[$x+1].mdl
-
-
- # Some diagnostics: the objective function progress and auxiliary-function
- # improvement. Note: this code is same as in train_mmi.sh
- tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
- objf=`cat $dir/tmpf | awk '{print $1}'`;
- nf=`cat $dir/tmpf | awk '{print $2}'`;
- rm $dir/tmpf
- impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
- impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
- echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
- x=$[$x+1]
-done
-
-echo "MMI training finished"
-
-rm $dir/final.mdl 2>/dev/null
-ln -s $x.mdl $dir/final.mdl
-
-exit 0;
diff --git a/egs/wsj/s5/steps/tandem/train_sgmm.sh b/egs/wsj/s5/steps/tandem/train_sgmm.sh
+++ /dev/null
@@ -1,315 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-# Korbinian Riedhammer
-
-# SGMM training, with speaker vectors. This script would normally be called on
-# top of fMLLR features obtained from a conventional system, but it also works
-# on top of any type of speaker-independent features (based on
-# deltas+delta-deltas or LDA+MLLT). For more info on SGMMs, see the paper "The
-# subspace Gaussian mixture model--A structured model for speech recognition".
-# (Computer Speech and Language, 2011).
-
-# Begin configuration section.
-nj=4
-cmd=run.pl
-stage=-6
-context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a
-# quinphone system.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
-num_iters=25 # Total number of iterations
-num_iters_alimdl=3 # Number of iterations for estimating alignment model.
-max_iter_inc=15 # Last iter to increase #substates on.
-realign_iters="5 10 15"; # Iters to realign on.
-spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
-increase_dim_iters="6 8"; # Iters on which to increase phn dim and/or spk dim;
- # rarely necessary, and if it is, only the 1st will normally be necessary.
-rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
-phn_dim= # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
-spk_dim= # You can use this to set the speaker subspace dim. [default: feat-dim]
-power=0.2 # Exponent for number of gaussians according to occurrence counts
-beam=8
-retry_beam=40
-cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
-normft2=true
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-if [ -f path.sh ]; then . ./path.sh; fi
-. parse_options.sh || exit 1;
-
-
-if [ $# != 8 ]; then
- echo "Usage: steps/tandem/train_sgmm.sh <num-leaves> <num-substates> <data1> <data2> <lang> <ali-dir> <ubm> <exp-dir>"
- echo " e.g.: steps/tandem/train_sgmm.sh 3500 10000 {mfcc,bottleneck},data/train_si84 data/lang \\"
- echo " exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
- echo "main options (for others, see top of script file)"
- echo " --config <config-file> # config containing options"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- echo " --silence-weight <sil-weight> # weight for silence (e.g. 0.5 or 0.0)"
- echo " --num-iters <#iters> # Number of iterations of E-M"
- exit 1;
-fi
-
-
-num_leaves=$1
-totsubstates=$2
-data1=$3
-data2=$4
-lang=$5
-alidir=$6
-ubm=$7
-dir=$8
-
-# Check some files.
-for f in $data1/feats.scp $data2/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-
-# Set some variables.
-oov=`cat $lang/oov.int`
-silphonelist=`cat $lang/phones/silence.csl`
-numsubstates=$num_leaves # Initial #-substates.
-incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates
-feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1;
-[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric.
-[ -z $phn_dim ] && phn_dim=$[$feat_dim+1]
-[ -z $spk_dim ] && spk_dim=$feat_dim
-nj=`cat $alidir/num_jobs` || exit 1;
-
-mkdir -p $dir/log
-echo $nj > $dir/num_jobs
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
-cp $lang/phones.txt $dir || exit 1;
-
-sdata1=$data1/split$nj;
-sdata2=$data2/split$nj;
-[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1;
-[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1;
-
-spkvecs_opt= # Empty option for now, until we estimate the speaker vectors.
-gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
-
-## Set up features.
-splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
-normft2=`cat $alidir/normft2 2>/dev/null`
-
-if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-
-case $feat_type in
- delta)
- echo "$0: feature type is $feat_type"
- ;;
- lda)
- echo "$0: feature type is $feat_type"
- cp $alidir/{lda,final}.mat $dir/ || exit 1;
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-
-# set up feature stream 1; this are usually spectral features, so we will add
-# deltas or splice them
-feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |"
-
-if [ "$feat_type" == "delta" ]; then
- feats1="$feats1 add-deltas ark:- ark:- |"
-elif [ "$feat_type" == "lda" ]; then
- feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |"
-fi
-
-# set up feature stream 2; this are usually bottleneck or posterior features,
-# which may be normalized if desired
-feats2="scp:$sdata2/JOB/feats.scp"
-
-if [ "$normft2" == "true" ]; then
- feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |"
-fi
-
-# assemble tandem features
-feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |"
-
-# add transformation, if applicable
-if [ "$feat_type" == "lda" ]; then
- feats="$feats transform-feats $dir/final.mat ark:- ark:- |"
-fi
-
-# splicing/normalization options
-cp $alidir/{splice_opts,normft2,tandem} $dir 2>/dev/null
-
-if [ -f $alidir/trans.1 ]; then
- echo "$0: using transforms from $alidir"
- feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
-fi
-##
-
-
-if [ $stage -le -6 ]; then
- echo "$0: accumulating tree stats"
- $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
- acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
- "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
- [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
- sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
- rm $dir/*.treeacc
-fi
-
-if [ $stage -le -5 ]; then
- echo "$0: Getting questions for tree clustering."
- # preparing questions, roots file...
- cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
- cat $lang/phones/extra_questions.int >> $dir/questions.int
- compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
-
- echo "$0: Building the tree"
- $cmd $dir/log/build_tree.log \
- build-tree --verbose=1 --max-leaves=$num_leaves \
- --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
- $dir/questions.qst $lang/topo $dir/tree || exit 1;
-fi
-
-if [ $stage -le -4 ]; then
- echo "$0: Initializing the model"
- # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
- # will be truncated on initialization.
- $cmd $dir/log/init_sgmm.log \
- sgmm-init --phn-space-dim=$phn_dim --spk-space-dim=$spk_dim $lang/topo \
- $dir/tree $ubm $dir/0.mdl || exit 1;
-fi
-
-if [ $stage -le -3 ]; then
- echo "$0: doing Gaussian selection"
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect $dir/0.mdl "$feats" \
- "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
-fi
-
-if [ $stage -le -2 ]; then
- echo "$0: compiling training graphs"
- text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata1/JOB/text|"
- $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
- compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl $lang/L.fst \
- "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
-fi
-
-if [ $stage -le -1 ]; then
- echo "$0: Converting alignments"
- $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
- convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
-fi
-
-x=0
-while [ $x -lt $num_iters ]; do
- echo "$0: training pass $x ... "
- if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
- echo "$0: re-aligning data"
- $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
- sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
- --utt2spk=ark:$sdata1/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
- $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
- fi
- if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
- ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
- weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata1/JOB/spk2utt \
- $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
- ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
- fi
- spkvecs_opt[$n]="--spk-vecs=ark:$dir/vecs.JOB"
- fi
- if [ $x -eq 0 ]; then
- flags=vwcSt # on the first iteration, don't update projections M or N
- elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then
- # Update N if we have speaker-vector space and x is odd,
- # and we've already updated the speaker vectors...
- flags=vNwcSt
- else
- # otherwise update M.
- flags=vMwcSt
- fi
-
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
- sgmm-acc-stats $spkvecs_opt --utt2spk=ark:$sdata1/JOB/utt2spk \
- --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \
- $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \
- $dir/$x.JOB.acc || exit 1;
- fi
-
- # The next option is needed if the user specifies a phone or speaker sub-space
- # dimension that's higher than the "normal" one.
- increase_dim_opts=
- if echo $increase_dim_iters | grep -w $x >/dev/null; then
- increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
- # Note: the command below might have a null effect on some iterations.
- if [ $spk_dim -gt $feat_dim ]; then
- cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
- copy-vector --print-args=false --change-dim=$spk_dim \
- ark:$dir/vecs.JOB ark:$dir/vecs_tmp.$JOB '&&' \
- mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
- fi
- fi
-
- if [ $stage -le $x ]; then
- $cmd $dir/log/update.$x.log \
- sgmm-est --update-flags=$flags --split-substates=$numsubstates $increase_dim_opts \
- --power=$power --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.*.acc|" \
- $dir/$[$x+1].mdl || exit 1;
- rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
- fi
-
- if [ $x -lt $max_iter_inc ]; then
- numsubstates=$[$numsubstates+$incsubstates]
- fi
- x=$[$x+1];
-done
-
-rm $dir/final.mdl $dir/final.occs 2>/dev/null
-ln -s $x.mdl $dir/final.mdl
-ln -s $x.occs $dir/final.occs
-
-if [ $spk_dim -gt 0 ]; then
- # We need to create an "alignment model" that's been trained
- # without the speaker vectors, to do the first-pass decoding with.
- # in test time.
-
- # We do this for a few iters, in this recipe.
- final_mdl=$dir/$x.mdl
- cur_alimdl=$dir/$x.mdl
- while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
- echo "$0: building alignment model (pass $x)"
- if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
- flags=MwcS # don't update v the first time. Note-- we never update transitions.
- # they wouldn't change anyway as we use the same alignment as previously.
- else
- flags=vMwcS
- fi
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \
- ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
- sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \
- --utt2spk=ark:$sdata1/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \
- sgmm-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
- $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1;
- $cmd $dir/log/update_ali.$x.log \
- sgmm-est --update-flags=$flags --remove-speaker-space=true --power=$power $cur_alimdl \
- "sgmm-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1;
- rm $dir/$x.*.aliacc || exit 1;
- [ $x -gt $num_iters ] && rm $dir/$x.alimdl
- fi
- cur_alimdl=$dir/$[$x+1].alimdl
- x=$[$x+1]
- done
- rm $dir/final.alimdl 2>/dev/null
- ln -s $x.alimdl $dir/final.alimdl
-fi
-
-utils/summarize_warnings.pl $dir/log
-
-echo Done
diff --git a/egs/wsj/s5/steps/train_mmi_sgmm.sh b/egs/wsj/s5/steps/train_mmi_sgmm.sh
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-
-# MMI training (or optionally boosted MMI, if you give the --boost option),
-# for SGMMs. 4 iterations (by default) of Extended Baum-Welch update.
-#
-# Begin configuration section.
-cmd=run.pl
-num_iters=4
-boost=0.0
-cancel=true # if true, cancel num and den counts on each frame.
-acwt=0.1
-stage=0
-
-update_opts=
-transform_dir=
-# End configuration section
-
-echo "$0 $@" # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-if [ $# -ne 5 ]; then
- echo "Usage: steps/train_mmi_sgmm.sh <data> <lang> <ali> <denlats> <exp>"
- echo " e.g.: steps/train_mmi_sgmm.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi"
- echo "Main options (for others, see top of script file)"
- echo " --boost <boost-weight> # (e.g. 0.1), for boosted MMI. (default 0)"
- echo " --cancel (true|false) # cancel stats (true by default)"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- echo " --config <config-file> # config containing options"
- echo " --stage <stage> # stage to do partial re-run from."
- echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
- exit 1;
-fi
-
-data=$1
-lang=$2
-alidir=$3
-denlatdir=$4
-dir=$5
-mkdir -p $dir/log
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
-cp $lang/phones.txt $dir || exit 1;
-
-for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-nj=`cat $alidir/num_jobs` || exit 1;
-[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \
- echo "$alidir and $denlatdir have different num-jobs" && exit 1;
-
-sdata=$data/split$nj
-splice_opts=`cat $alidir/splice_opts 2>/dev/null`
-cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
-mkdir -p $dir/log
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-cp $alidir/splice_opts $dir 2>/dev/null
-cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
-echo $nj > $dir/num_jobs
-
-cp $alidir/tree $dir
-cp $alidir/final.mdl $dir/0.mdl
-cp $alidir/final.alimdl $dir
-
-silphonelist=`cat $lang/phones/silence.csl` || exit 1;
-
-# Set up features
-
-if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
- cp $alidir/final.mat $dir
- ;;
- *) echo "Invalid feature type $feat_type" && exit 1;
-esac
-
-if [ ! -z "$transform_dir" ]; then
- echo "$0: using transforms from $transform_dir"
- [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \
- && exit 1;
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |"
-else
- echo "$0: no fMLLR transforms."
-fi
-
-if [ -f $alidir/vecs.1 ]; then
- echo "$0: using speaker vectors from $alidir"
- spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk"
-else
- echo "$0: no speaker vectors."
- spkvecs_opt=
-fi
-
-if [ -f $alidir/gselect.1.gz ]; then
- echo "$0: using Gaussian-selection info from $alidir"
- gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|"
-else
- echo "$0: error: no Gaussian-selection info found" && exit 1;
-fi
-
-lats="ark:gunzip -c $denlatdir/lat.JOB.gz|"
-if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then
- lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |"
-fi
-
-x=0
-while [ $x -lt $num_iters ]; do
- echo "Iteration $x of MMI training"
- # Note: the num and den states are accumulated at the same time, so we
- # can cancel them per frame.
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
- sgmm-rescore-lattice --speedup=true "$gselect_opt" $spkvecs_opt $dir/$x.mdl "$lats" "$feats" ark:- \| \
- lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
- sum-post --merge=$cancel --scale1=-1 \
- ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \
- sgmm-acc-stats2 "$gselect_opt" $spkvecs_opt $dir/$x.mdl "$feats" ark,s,cs:- \
- $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1;
-
- n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`;
- [ "$n" -ne $[$nj*2] ] && \
- echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1;
- $cmd $dir/log/den_acc_sum.$x.log \
- sgmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1;
- rm $dir/den_acc.$x.*.acc
- $cmd $dir/log/num_acc_sum.$x.log \
- sgmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1;
- rm $dir/num_acc.$x.*.acc
-
- $cmd $dir/log/update.$x.log \
- sgmm-est-ebw $update_opts $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1;
- fi
-
- # Some diagnostics: the objective function progress and auxiliary-function
- # improvement. Note: this code is same as in train_mmi.sh
- tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while(<STDIN>) { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf
- objf=`cat $dir/tmpf | awk '{print $1}'`;
- nf=`cat $dir/tmpf | awk '{print $2}'`;
- rm $dir/tmpf
- impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'`
- impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames.
- echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log
- x=$[$x+1]
-done
-
-echo "MMI training finished"
-
-rm $dir/final.mdl 2>/dev/null
-ln -s $x.mdl $dir/final.mdl
-
-exit 0;
diff --git a/egs/wsj/s5/steps/train_sgmm.sh b/egs/wsj/s5/steps/train_sgmm.sh
+++ /dev/null
@@ -1,280 +0,0 @@
-#!/bin/bash
-
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
-
-# SGMM training, with speaker vectors. This script would normally be called on
-# top of fMLLR features obtained from a conventional system, but it also works
-# on top of any type of speaker-independent features (based on
-# deltas+delta-deltas or LDA+MLLT). For more info on SGMMs, see the paper "The
-# subspace Gaussian mixture model--A structured model for speech recognition".
-# (Computer Speech and Language, 2011).
-
-# Begin configuration section.
-nj=4
-cmd=run.pl
-stage=-6
-context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a
-# quinphone system.
-scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
-num_iters=25 # Total number of iterations
-num_iters_alimdl=3 # Number of iterations for estimating alignment model.
-max_iter_inc=15 # Last iter to increase #substates on.
-realign_iters="5 10 15"; # Iters to realign on.
-spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
-increase_dim_iters="6 8"; # Iters on which to increase phn dim and/or spk dim;
- # rarely necessary, and if it is, only the 1st will normally be necessary.
-rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
-phn_dim= # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
-spk_dim= # You can use this to set the speaker subspace dim. [default: feat-dim]
-power=0.25 # Exponent for number of gaussians according to occurrence counts
-beam=8
-retry_beam=40
-cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves
-# End configuration section.
-
-echo "$0 $@" # Print the command line for logging
-
-if [ -f path.sh ]; then . ./path.sh; fi
-. parse_options.sh || exit 1;
-
-
-if [ $# != 7 ]; then
- echo "Usage: steps/train_sgmm.sh <num-leaves> <num-substates> <data> <lang> <ali-dir> <ubm> <exp-dir>"
- echo " e.g.: steps/train_sgmm.sh 3500 10000 data/train_si84 data/lang \\"
- echo " exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
- echo "main options (for others, see top of script file)"
- echo " --config <config-file> # config containing options"
- echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
- echo " --silence-weight <sil-weight> # weight for silence (e.g. 0.5 or 0.0)"
- echo " --num-iters <#iters> # Number of iterations of E-M"
- exit 1;
-fi
-
-
-num_leaves=$1
-totsubstates=$2
-data=$3
-lang=$4
-alidir=$5
-ubm=$6
-dir=$7
-
-# Check some files.
-for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do
- [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-
-# Set some variables.
-oov=`cat $lang/oov.int`
-silphonelist=`cat $lang/phones/silence.csl`
-numsubstates=$num_leaves # Initial #-substates.
-incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates
-feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1;
-[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric.
-[ -z $phn_dim ] && phn_dim=$[$feat_dim+1]
-[ -z $spk_dim ] && spk_dim=$feat_dim
-nj=`cat $alidir/num_jobs` || exit 1;
-ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
-
-mkdir -p $dir/log
-echo $nj > $dir/num_jobs
-sdata=$data/split$nj;
-splice_opts=`cat $alidir/splice_opts 2>/dev/null`
-cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
-cp $alidir/splice_opts $dir 2>/dev/null
-cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
-
-utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
-cp $lang/phones.txt $dir || exit 1;
-
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-
-spkvecs_opt= # Empty option for now, until we estimate the speaker vectors.
-gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|"
-
-## Set up features.
-if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-echo "$0: feature type is $feat_type"
-
-case $feat_type in
- delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
- lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
- cp $alidir/final.mat $dir
- ;;
- *) echo "$0: invalid feature type $feat_type" && exit 1;
-esac
-if [ -f $alidir/trans.1 ]; then
- echo "$0: using transforms from $alidir"
- feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
-fi
-##
-
-
-if [ $stage -le -6 ]; then
- echo "$0: accumulating tree stats"
- $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
- acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
- "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
- [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1;
- sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
- rm $dir/*.treeacc
-fi
-
-if [ $stage -le -5 ]; then
- echo "$0: Getting questions for tree clustering."
- # preparing questions, roots file...
- cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
- cat $lang/phones/extra_questions.int >> $dir/questions.int
- compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
-
- echo "$0: Building the tree"
- $cmd $dir/log/build_tree.log \
- build-tree --verbose=1 --max-leaves=$num_leaves \
- --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
- $dir/questions.qst $lang/topo $dir/tree || exit 1;
-fi
-
-if [ $stage -le -4 ]; then
- echo "$0: Initializing the model"
- # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims
- # will be truncated on initialization.
- $cmd $dir/log/init_sgmm.log \
- sgmm-init --phn-space-dim=$phn_dim --spk-space-dim=$spk_dim $lang/topo \
- $dir/tree $ubm $dir/0.mdl || exit 1;
-fi
-
-if [ $stage -le -3 ]; then
- echo "$0: doing Gaussian selection"
- $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
- sgmm-gselect $dir/0.mdl "$feats" \
- "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
-fi
-
-if [ $stage -le -2 ]; then
- echo "$0: compiling training graphs"
- text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text|"
- $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
- compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl $lang/L.fst \
- "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
-fi
-
-if [ $stage -le -1 ]; then
- echo "$0: Converting alignments"
- $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \
- convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \
- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
-fi
-
-x=0
-while [ $x -lt $num_iters ]; do
- echo "$0: training pass $x ... "
- if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
- echo "$0: re-aligning data"
- $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
- sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \
- --utt2spk=ark:$sdata/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \
- $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
- "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
- fi
- if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \
- ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
- weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
- sgmm-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata/JOB/spk2utt \
- $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \
- ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1;
- fi
- spkvecs_opt[$n]="--spk-vecs=ark:$dir/vecs.JOB"
- fi
- if [ $x -eq 0 ]; then
- flags=vwcSt # on the first iteration, don't update projections M or N
- elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then
- # Update N if we have speaker-vector space and x is odd,
- # and we've already updated the speaker vectors...
- flags=vNwcSt
- else
- # otherwise update M.
- flags=vMwcSt
- fi
-
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
- sgmm-acc-stats $spkvecs_opt --utt2spk=ark:$sdata/JOB/utt2spk \
- --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \
- $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \
- $dir/$x.JOB.acc || exit 1;
- fi
-
- # The next option is needed if the user specifies a phone or speaker sub-space
- # dimension that's higher than the "normal" one.
- increase_dim_opts=
- if echo $increase_dim_iters | grep -w $x >/dev/null; then
- increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
- # Note: the command below might have a null effect on some iterations.
- if [ $spk_dim -gt $feat_dim ]; then
- cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \
- copy-vector --print-args=false --change-dim=$spk_dim \
- ark:$dir/vecs.JOB ark:$dir/vecs_tmp.$JOB '&&' \
- mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1;
- fi
- fi
-
- if [ $stage -le $x ]; then
- $cmd $dir/log/update.$x.log \
- sgmm-est --update-flags=$flags --split-substates=$numsubstates $increase_dim_opts \
- --power=$power --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.*.acc|" \
- $dir/$[$x+1].mdl || exit 1;
- rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null
- fi
-
- if [ $x -lt $max_iter_inc ]; then
- numsubstates=$[$numsubstates+$incsubstates]
- fi
- x=$[$x+1];
-done
-
-rm $dir/final.mdl $dir/final.occs 2>/dev/null
-ln -s $x.mdl $dir/final.mdl
-ln -s $x.occs $dir/final.occs
-
-if [ $spk_dim -gt 0 ]; then
- # We need to create an "alignment model" that's been trained
- # without the speaker vectors, to do the first-pass decoding with.
- # in test time.
-
- # We do this for a few iters, in this recipe.
- final_mdl=$dir/$x.mdl
- cur_alimdl=$dir/$x.mdl
- while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do
- echo "$0: building alignment model (pass $x)"
- if [ $x -eq $num_iters ]; then # 1st pass of building alimdl.
- flags=MwcS # don't update v the first time. Note-- we never update transitions.
- # they wouldn't change anyway as we use the same alignment as previously.
- else
- flags=vMwcS
- fi
- if [ $stage -le $x ]; then
- $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \
- ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
- sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \
- --utt2spk=ark:$sdata/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \
- sgmm-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \
- $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1;
- $cmd $dir/log/update_ali.$x.log \
- sgmm-est --update-flags=$flags --remove-speaker-space=true --power=$power $cur_alimdl \
- "sgmm-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1;
- rm $dir/$x.*.aliacc || exit 1;
- [ $x -gt $num_iters ] && rm $dir/$x.alimdl
- fi
- cur_alimdl=$dir/$[$x+1].alimdl
- x=$[$x+1]
- done
- rm $dir/final.alimdl 2>/dev/null
- ln -s $x.alimdl $dir/final.alimdl
-fi
-
-utils/summarize_warnings.pl $dir/log
-
-echo Done
diff --git a/src/Doxyfile b/src/Doxyfile
index f5e874be3ad18d5bc575ebb0e160aa0c3cfaa7cd..bf2dc5197e2f6e00a7c3e0d55510dade4e0d0475 100644 (file)
--- a/src/Doxyfile
+++ b/src/Doxyfile
# the lines after "doc itf" are copied from SUBDIRS in the Makefile.
INPUT = doc itf \
- base matrix util feat tree thread gmm transform sgmm \
+ base matrix util feat tree thread gmm transform \
fstext hmm lm decoder lat cudamatrix nnet \
- bin fstbin gmmbin fgmmbin sgmmbin featbin \
+ bin fstbin gmmbin fgmmbin featbin \
nnetbin latbin sgmm2 sgmm2bin nnet2 nnet2bin nnet3 nnet3bin \
kwsbin ivector ivectorbin
diff --git a/src/Makefile b/src/Makefile
index 9905be869a025a12211e712b8083363b8e61d92c..8bc18b254e9b04cceee3429eabaa22a0cf7c48be 100644 (file)
--- a/src/Makefile
+++ b/src/Makefile
SHELL := /bin/bash
-SUBDIRS = base matrix util feat tree thread gmm transform sgmm \
+SUBDIRS = base matrix util feat tree thread gmm transform \
fstext hmm lm decoder lat kws cudamatrix nnet \
- bin fstbin gmmbin fgmmbin sgmmbin featbin \
+ bin fstbin gmmbin fgmmbin featbin \
nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 chain nnet3bin nnet2bin kwsbin \
ivector ivectorbin online2 online2bin lmbin chainbin
-MEMTESTDIRS = base matrix util feat tree thread gmm transform sgmm \
+MEMTESTDIRS = base matrix util feat tree thread gmm transform \
fstext hmm lm decoder lat nnet kws chain \
- bin fstbin gmmbin fgmmbin sgmmbin featbin \
+ bin fstbin gmmbin fgmmbin featbin \
nnetbin latbin sgmm2 nnet2 nnet3 nnet2bin nnet3bin sgmm2bin kwsbin \
ivector ivectorbin online2 online2bin lmbin
# this is necessary for correct parallel compilation
#1)The tools depend on all the libraries
-bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \
- base matrix util feat tree thread gmm transform sgmm sgmm2 fstext hmm \
+bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \
+ base matrix util feat tree thread gmm transform sgmm2 fstext hmm \
lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2
#2)The libraries have inter-dependencies
tree: base util thread matrix
gmm: base util matrix tree thread
transform: base util matrix gmm tree thread
-sgmm: base util matrix gmm tree transform thread hmm
sgmm2: base util matrix gmm tree transform thread hmm
fstext: base util thread matrix tree
hmm: base tree matrix util thread
lm: base util thread matrix fstext
-decoder: base util thread matrix gmm sgmm hmm tree transform lat
+decoder: base util thread matrix gmm hmm tree transform lat
lat: base util thread hmm tree matrix
cudamatrix: base util thread matrix
nnet: base util hmm tree thread matrix cudamatrix
@@ -180,8 +179,8 @@ nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix chain fstex
chain: lat hmm tree fstext matrix cudamatrix util thread base
ivector: base util matrix thread transform tree gmm
#3)Dependencies for optional parts of Kaldi
-onlinebin: base matrix util feat tree gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread
-# python-kaldi-decoding: base matrix util feat tree thread gmm transform sgmm sgmm2 fstext hmm decoder lat online
+onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread
+# python-kaldi-decoding: base matrix util feat tree thread gmm transform sgmm2 fstext hmm decoder lat online
online: decoder gmm transform feat matrix util base lat hmm thread tree
online2: decoder gmm transform feat matrix util base lat hmm thread tree ivector cudamatrix nnet2 nnet3 chain
kws: base util thread hmm tree matrix lat
diff --git a/src/decoder/Makefile b/src/decoder/Makefile
index fe489d1cb3f97d07b60a5cbd9734b24fa943cc9a..93db701cb7a86ecd094f6718743a52922d86c11e 100644 (file)
--- a/src/decoder/Makefile
+++ b/src/decoder/Makefile
LIBNAME = kaldi-decoder
-ADDLIBS = ../lat/kaldi-lat.a ../sgmm/kaldi-sgmm.a ../hmm/kaldi-hmm.a \
+ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \
../matrix/kaldi-matrix.a ../base/kaldi-base.a
index 75a58011b1d7506e7e4b198931e2885a08f89f7c..49c9fb69e4210c5a0134e6583507d7925e5a9dc6 100644 (file)
export KALDI_ROOT=`pwd`/../..
# Setting paths to useful tools
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH
+export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH
# Defining audio data directory (modify it for your installation directory!)
export DATA_ROOT="/home/{user}/kaldi-trunk/egs/digits/digits_audio"
diff --git a/src/feat/Makefile b/src/feat/Makefile
index 71a341923473a6a92c403708443289f315e82fb5..e987de55b385cc313bb14fe1e0605466191d0ffc 100644 (file)
--- a/src/feat/Makefile
+++ b/src/feat/Makefile
TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \
feature-functions-test pitch-functions-test feature-sdc-test \
- resample-test online-feature-test sinusoid-detection-test \
- signal-test
+ resample-test online-feature-test signal-test
OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \
feature-spectrogram.o mel-computations.o wave-reader.o \
- pitch-functions.o resample.o online-feature.o sinusoid-detection.o \
- signal.o feature-window.o
+ pitch-functions.o resample.o online-feature.o signal.o \
+ feature-window.o
LIBNAME = kaldi-feat
diff --git a/src/feat/sinusoid-detection-test.cc b/src/feat/sinusoid-detection-test.cc
+++ /dev/null
@@ -1,452 +0,0 @@
-// feat/sinusoid-detection-test.cc
-
-// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include <iostream>
-
-#include "base/kaldi-math.h"
-#include "feat/sinusoid-detection.h"
-
-
-namespace kaldi {
-
-// this function is used for testing AddSinusoid.
-void AddSinusoidSimple(BaseFloat samp_freq,
- const Sinusoid &sinusoid,
- VectorBase<BaseFloat> *signal) {
- for (int32 i = 0; i < signal->Dim(); i++)
- (*signal)(i) += sinusoid.amplitude *
- cos(M_2PI * sinusoid.freq / samp_freq * i + sinusoid.phase);
-}
-
-void UnitTestAddSinusoid() {
- BaseFloat samp_freq = 560.1;
- int32 length = 511;
- Vector<BaseFloat> orig(length);
- orig.SetRandn();
- Vector<BaseFloat> orig2(orig);
- Sinusoid sinusoid(49.20, 2.111, 1.5);
-
- AddSinusoid(samp_freq, sinusoid, &orig);
- AddSinusoidSimple(samp_freq, sinusoid, &orig2);
- AssertEqual(orig, orig2);
-}
-
-
-
-void UnitTestQuadraticMaximizeEqualSpaced() {
- for (int32 n = 0; n < 50; n++) {
-
- // Let the cubic function be y = a x^2 + b x + c, and let
- // y0,y1,y2 be its values evaluated at x = [0, 1, 2]; we
- // want it evaluated at arbitrary x.
-
- BaseFloat a = -0.5 + RandUniform(), b = -0.5 + RandUniform(), c = -0.5 + RandUniform();
- BaseFloat y[3];
- for (int32 i = 0; i < 3; i++) {
- BaseFloat x = i;
- y[i] = a * x * x + b * x + c;
- }
- BaseFloat x_max, y_max;
- SinusoidDetector::QuadraticMaximizeEqualSpaced(y[0], y[1], y[2], &x_max, &y_max);
-
- for (int32 m = 0; m <= 10; m++) {
- BaseFloat x_test = 0.1 * m;
- BaseFloat y_test = a * x_test * x_test + b * x_test + c;
- KALDI_ASSERT(y_test <= y_max + 1.0e-05);
- }
- }
-}
-
-void UnitTestQuadraticMaximize() {
- for (int32 n = 0; n < 50; n++) {
-
- // Let the cubic function be y = a x^2 + b x + c, and let
- // y0,y1,y2 be its values evaluated at x = [0, 1, 2]; we
- // want it evaluated at arbitrary x.
-
- BaseFloat a = -0.5 + RandUniform(), b = -0.5 + RandUniform(), c = -0.5 + RandUniform(),
- x = 0.1 + RandUniform() * 0.98;
- BaseFloat y[3];
- for (int32 i = 0; i < 3; i++) {
- BaseFloat this_x;
- if (i == 0) { this_x = 0.0; }
- else if (i == 1) { this_x = x; }
- else { this_x = 1.0; }
- y[i] = a * this_x * this_x + b * this_x + c;
- }
- BaseFloat x_max, y_max;
- SinusoidDetector::QuadraticMaximize(x, y[0], y[1], y[2], &x_max, &y_max);
-
- for (int32 m = 0; m <= 10; m++) {
- BaseFloat x_test = 0.1 * m;
- BaseFloat y_test = a * x_test * x_test + b * x_test + c;
- if (n < 100 && m == 5) {
- KALDI_VLOG(2) << "Checking y_test <= y_max: "
- << y_test << " <= " << y_max << " [x_max = "
- << x_max << "]";
- KALDI_ASSERT(y_test <= y_max + 1.0e-05);
- }
- }
- }
-}
-
-
-void UnitTestSinusoidDetector() {
- BaseFloat samp_freq = 4000 + (rand() % 2000);
- int32 num_samp = 128 + rand() % 400;
- SinusoidDetector detector(samp_freq, num_samp);
-
- for (int32 i = 0; i < 40; i++) {
-
- Vector<BaseFloat> signal(num_samp);
-
- // Sinusoid ref_sinusoid(1.3, 312.5, M_PI * 0.0);
- // Sinusoid ref_sinusoid(1.3, 324.125, M_PI * 0.5);
-
- BaseFloat nyquist = samp_freq * 0.5;
- BaseFloat freq = nyquist * RandUniform();
- BaseFloat amplitude = RandUniform();
- BaseFloat phase = M_2PI * RandUniform();
-
- Sinusoid ref_sinusoid(amplitude, freq, phase);
-
- AddSinusoid(samp_freq, ref_sinusoid, &signal);
-
-
- BaseFloat orig_energy = VecVec(signal, signal);
- KALDI_LOG << "Real frequency is " << freq << ", amplitude "
- << amplitude << ", phase " << phase << ", samp-freq "
- << samp_freq;
- KALDI_LOG << "Total energy of signal (with sinusoid) is " << orig_energy;
-
- Sinusoid sinusoid;
- BaseFloat min_energy = 0.0;
- BaseFloat energy = detector.DetectSinusoid(min_energy,
- signal, &sinusoid);
-
- Vector<BaseFloat> new_signal(signal);
- sinusoid.phase += M_PI; // Reverse the phase.
- AddSinusoid(samp_freq, sinusoid, &new_signal);
- BaseFloat delta_energy = VecVec(signal, signal) -
- VecVec(new_signal, new_signal);
- KALDI_LOG << "Projected delta energy = " << energy
- << " and observed was " << delta_energy;
-
- BaseFloat remaining_energy = VecVec(new_signal, new_signal);
- if (remaining_energy > 0.01 * orig_energy) {
- KALDI_WARN << "Energy remaining is " << remaining_energy
- << " vs. original " << orig_energy;
- BaseFloat relative_freq = freq / nyquist;
- BaseFloat inv_num_samp = 1.0 / num_samp;
- // We only tolerate this kind of error for very ridiculous frequency,
- // close to zero or the Nyquist.
- KALDI_ASSERT(relative_freq < inv_num_samp ||
- relative_freq > 1.0 - inv_num_samp);
- }
- }
-}
-
-// as UnitTestSinusoidDetector(), but doing it in noisy signals.
-void UnitTestSinusoidDetectorNoisy() {
- BaseFloat samp_freq = 4000 + (rand() % 2000);
- int32 num_samp = 128 + rand() % 400;
- SinusoidDetector detector(samp_freq, num_samp);
-
- for (int32 i = 0; i < 40; i++) {
-
- Vector<BaseFloat> signal(num_samp);
-
- signal.SetRandn();
-
- BaseFloat rand_energy = VecVec(signal, signal);
-
- // Sinusoid ref_sinusoid(1.3, 312.5, M_PI * 0.0);
- // Sinusoid ref_sinusoid(1.3, 324.125, M_PI * 0.5);
-
- BaseFloat nyquist = samp_freq * 0.5;
- BaseFloat freq = nyquist * RandUniform();
- BaseFloat amplitude = 10.0 * RandUniform();
- BaseFloat phase = M_2PI * RandUniform();
-
- Sinusoid ref_sinusoid(amplitude, freq, phase);
-
- AddSinusoid(samp_freq, ref_sinusoid, &signal);
-
- BaseFloat tot_energy = VecVec(signal, signal);
-
- KALDI_LOG << "Real frequency is " << freq << ", amplitude "
- << amplitude << ", phase " << phase << ", samp-freq "
- << samp_freq;
- KALDI_LOG << "Total energy of signal (with noise + sinusoid) is " << tot_energy;
-
- Sinusoid sinusoid;
- BaseFloat min_energy = 0.0;
- BaseFloat energy = detector.DetectSinusoid(min_energy,
- signal, &sinusoid);
-
- Vector<BaseFloat> new_signal(signal);
- sinusoid.phase += M_PI; // reverse the phase.
- AddSinusoid(samp_freq, sinusoid, &new_signal);
- BaseFloat delta_energy = VecVec(signal, signal) -
- VecVec(new_signal, new_signal);
- KALDI_LOG << "Projected delta energy = " << energy
- << " and observed was " << delta_energy;
-
- BaseFloat min_energy_diff = 0.99 * (tot_energy - rand_energy);
-
- if (delta_energy < min_energy_diff) {
- KALDI_WARN << "Energy reduction is " << delta_energy
- << " vs. expected " << (tot_energy - rand_energy);
- BaseFloat relative_freq = freq / nyquist;
- BaseFloat inv_num_samp = 1.0 / num_samp;
- // We only tolerate this kind of error for very ridiculous frequency,
- // close to zero or the Nyquist.
- KALDI_ASSERT(relative_freq < inv_num_samp ||
- relative_freq > 1.0 - inv_num_samp);
- }
- }
-}
-
-
-void AddFreqToSignal(BaseFloat base_freq,
- BaseFloat samp_freq,
- BaseFloat tolerance,
- BaseFloat gain,
- VectorBase<BaseFloat> *signal) {
- BaseFloat error_scale = (2 * RandUniform() - 1) * tolerance;
- BaseFloat freq = base_freq * (1.0 + error_scale);
- KALDI_VLOG(3) << "base-freq = " << base_freq << ", factor = " << error_scale;
- for (int32 i = 0; i < signal->Dim(); i++)
- (*signal)(i) += gain * sin(i * 2.0 * 3.14159 * freq / samp_freq);
-}
-
-
-void GenerateDtmfTestCase(
- BaseFloat sampling_rate,
- Vector<BaseFloat> *signal,
- std::vector<MultiSinusoidDetectorOutput> *ref_output) {
- // the "ref_output" should correlate with the first of each run of frames with the same label.
-
- BaseFloat min_duration_secs = 0.04; // min duration of dtmf or non-tone segments.
- BaseFloat min_dialtone_duration_secs = 0.1;
- BaseFloat frequency_tolerance = 0.035;
- BaseFloat dialtone_frequency_tolerance = 0.4 * (440.0 - 425.0) / 440.0;
-
- int32 num_events = 2 * (5 + rand() % 5) + 1; // odd number.
- int32 tot_signal_dim = 0;
-
- ref_output->resize(num_events);
- std::vector<Vector<BaseFloat> > all_signals(num_events);
- for (int32 i = 0; i < num_events; i++) {
- MultiSinusoidDetectorOutput &this_output = (*ref_output)[i];
- Vector<BaseFloat> &this_signal = all_signals[i];
- BaseFloat duration_secs = min_duration_secs * (1 + rand() % 3);
- int32 num_samp = sampling_rate * duration_secs;
- tot_signal_dim += num_samp;
-
- this_signal.Resize(num_samp);
- this_signal.SetRandn();
-
- if (i % 2 == 0); // do nothing;
- else if (rand() % 2 == 0 && duration_secs >= min_dialtone_duration_secs) {
- // dialtone.
- BaseFloat freq;
- if (rand() % 3 == 0) { freq = 350; }
- else if (rand() % 2 == 0) { freq = 440; }
- else { freq = 425; }
- BaseFloat gain = 10.0 * (1.0 + rand() % 2);
- AddFreqToSignal(freq, sampling_rate, dialtone_frequency_tolerance,
- gain, &(this_signal));
- this_output.freq1 = freq;
- } else {
- // dtmf. use a subset of tones as examples.
- BaseFloat freq1, freq2;
- char c;
- if (rand() % 4 == 0) {
- c = '8'; freq1 = 852; freq2 = 1336;
- } else if (rand() % 3 == 0) {
- c = '0'; freq1 = 941; freq2 = 1336;
- } else if (rand() % 2 == 0) {
- c = '#'; freq1 = 941; freq2 = 1477;
- } else {
- c = '1'; freq1 = 697; freq2 = 1209;
- }
- BaseFloat base_gain = 10.0 * (1.0 + (rand() % 3)),
- gain_factor = 1.0 + 0.1 * (-2 + rand() % 5),
- gain1 = base_gain, gain2 = gain_factor * base_gain;
- AddFreqToSignal(freq1, sampling_rate, frequency_tolerance, gain1,
- &(this_signal));
- AddFreqToSignal(freq2, sampling_rate, frequency_tolerance, gain2,
- &(this_signal));
- this_output.freq1 = freq1;
- this_output.freq2 = freq2;
- }
- }
- signal->Resize(tot_signal_dim);
- int32 signal_offset = 0;
- for (int32 i = 0; i < num_events; i++) {
- int32 this_dim = all_signals[i].Dim();
- signal->Range(signal_offset, this_dim).CopyFromVec(all_signals[i]);
- signal_offset += this_dim;
- }
-}
-
-
-/*
-
-// Just a basic test to check that it produces output.
-
-void UnitTestToneDetection() {
- BaseFloat samp_freq = (rand() % 2) == 0 ? 8000 : 16000;
- ToneDetectionConfig config;
-
- int32 num_frames = 100 + (rand() % 100);
- int32 frame_length = static_cast<int32>(samp_freq * config.frame_length_secs);
-
- int32 num_samples = frame_length * num_frames + rand() % frame_length;
- Vector<BaseFloat> signal(num_samples);
- signal.SetRandn();
-
- ToneDetector tone_detector(config, samp_freq);
-
- int32 signal_offset = 0;
-
- std::vector<ToneDetectorOutput*> tone_detector_output;
-
- while (signal_offset < num_samples) {
- int32 signal_remaining = num_samples - signal_offset,
- chunk_size = std::min<int32>((rand() % 200) + 100,
- signal_remaining);
- SubVector<BaseFloat> signal_part(signal, signal_offset, chunk_size);
- tone_detector.AcceptWaveform(signal_part);
- signal_offset += chunk_size;
-
- if (signal_offset == num_samples)
- tone_detector.WaveformFinished();
- while (!tone_detector.Done() &&
- (rand() % 2 == 0 || signal_offset == num_samples)) {
- ToneDetectorOutput *output = new ToneDetectorOutput();
- tone_detector.GetNextFrame(output);
- tone_detector_output.push_back(output);
- }
- }
- KALDI_ASSERT(signal_offset == num_samples);
-
- Vector<BaseFloat> signal2(signal.Dim());
- signal_offset = 0;
- for (int32 i = 0; i < tone_detector_output.size(); i++) {
- ToneDetectorOutput *output = tone_detector_output[i];
- signal2.Range(signal_offset,
- output->signal.Dim()).CopyFromVec(output->signal);
- signal_offset += output->signal.Dim();
- if (output->frame_type != 'n') {
- KALDI_ERR << "Frame " << i << " badly classified, should be 'n', is: "
- << output->frame_type;
- }
- delete output;
- }
- KALDI_ASSERT(signal_offset == num_samples &&
- signal.ApproxEqual(signal2, 1.0e-10));
-
-}
-
-std::ostringstream & operator << (std::ostringstream &ostr,
- const ToneDetectorOutput &output) {
- ostr << output.frame_type;
- if (output.frame_type == 'd')
- ostr << output.dialtone_freq;
- ostr << ' ';
- return ostr;
-}
-
-*/
-
-
-// This version of the unit-test generates a signal that has tones in it, and
-// runs the detection on that signal.
-void UnitTestToneDetection2() {
- BaseFloat samp_freq = (rand() % 2) == 0 ? 8000 : 16000;
- Vector<BaseFloat> signal;
- std::vector<MultiSinusoidDetectorOutput> ref_output;
- GenerateDtmfTestCase(samp_freq, &signal, &ref_output);
-
- MultiSinusoidDetectorConfig config;
-
- int32 num_samples = signal.Dim();
- KALDI_ASSERT(num_samples > 0);
-
- MultiSinusoidDetector multi_sinusoid_detector(config, samp_freq);
-
- int32 signal_offset = 0;
-
- std::vector<MultiSinusoidDetectorOutput*> multi_sinusoid_detector_output;
-
- while (signal_offset < num_samples) {
- int32 signal_remaining = num_samples - signal_offset,
- chunk_size = std::min<int32>((rand() % 200) + 100,
- signal_remaining);
- SubVector<BaseFloat> signal_part(signal, signal_offset, chunk_size);
- multi_sinusoid_detector.AcceptWaveform(signal_part);
- signal_offset += chunk_size;
-
- if (signal_offset == num_samples)
- multi_sinusoid_detector.WaveformFinished();
- while (!multi_sinusoid_detector.Done() &&
- (rand() % 2 == 0 || signal_offset == num_samples)) {
- MultiSinusoidDetectorOutput *output = new MultiSinusoidDetectorOutput();
- multi_sinusoid_detector.GetNextFrame(output);
- multi_sinusoid_detector_output.push_back(output);
- }
- }
- KALDI_ASSERT(signal_offset == num_samples);
-
- // std::ostringstream str_ref, str_hyp;
- //for (size_t i = 0; i < ref_output.size(); i++)
- // str_ref << ref_output[i];
-
-
- for (size_t i = 0; i < multi_sinusoid_detector_output.size(); i++) {
- MultiSinusoidDetectorOutput *output = multi_sinusoid_detector_output[i];
- KALDI_LOG << "tot-energy = " << output->tot_energy
- << ", freq1 " << output->freq1 << ", energy1 " << output->energy1
- << ", freq2 " << output->freq2 << ", energy2 " << output->energy2;
- delete output;
- }
-}
-
-
-
-} // namespace kaldi
-
-int main() {
- using namespace kaldi;
-
- SetVerboseLevel(4);
-
- UnitTestToneDetection2();
- UnitTestAddSinusoid();
- UnitTestQuadraticMaximizeEqualSpaced();
- UnitTestQuadraticMaximize();
- for (int32 i = 0; i < 10; i++) {
- UnitTestSinusoidDetector();
- UnitTestSinusoidDetectorNoisy();
- }
-
-}
diff --git a/src/feat/sinusoid-detection.cc b/src/feat/sinusoid-detection.cc
+++ /dev/null
@@ -1,945 +0,0 @@
-// feat/sinusoid-detection.cc
-
-// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "feat/sinusoid-detection.h"
-#include "matrix/matrix-functions.h"
-#include "feat/resample.h"
-
-namespace kaldi {
-
-
-
-// This function adds the given sinusoid to the signal, as:
-// (*signal)(t) += amplitude * cos(2 pi freq/samp_freq t + phase).
-void AddSinusoid(BaseFloat samp_freq,
- const Sinusoid &sinusoid,
- VectorBase<BaseFloat> *signal) {
- // treat "factor" as a complex variable equal to exp(i * 2 pi freq / samp_freq); it's
- // the factor by which we multiply on each frame.
- BaseFloat factor_real = cos(M_2PI * sinusoid.freq / samp_freq),
- factor_im = sin(M_2PI * sinusoid.freq / samp_freq);
- BaseFloat *signal_data = signal->Data();
- int32 dim = signal->Dim(), batch_size = 100;
- // process frames in batches of size "batch_size", after which we recompute
- // the starting point to prevent loss of accuracy due to drift.
- for (int32 b = 0; b * batch_size < dim; b++) {
- int32 t_offset = b * batch_size,
- t_end = std::min(dim, t_offset + batch_size);
- double phase = sinusoid.phase + M_2PI * t_offset * sinusoid.freq / samp_freq;
- // treat x as a complex variable which initially is equal to amplitude * exp(i * phase),
- // but which gets multiplied by "factor" on each frame.
- BaseFloat x_real = sinusoid.amplitude * cos(phase),
- x_im = sinusoid.amplitude * sin(phase);
- for (int32 t = t_offset; t < t_end; t++) {
- signal_data[t] += x_real;
- ComplexMul(factor_real, factor_im, &x_real, &x_im); // x *= factor.
- }
- }
-}
-
-
-// static
-void SinusoidDetector::QuadraticMaximizeEqualSpaced(
- BaseFloat y0, BaseFloat y1, BaseFloat y2,
- BaseFloat *x_max, BaseFloat *y_max) {
- // Let the function be y = a x^2 + b x + c, and
- // suppose we have the values of y(0), y(1) and y(2).
- // We have y0 = c, y1 = a + b + c, and y2 = 4a + 2b + c,
- // so c = y0.
- // Also, y2 - 2 y1 = 2a - c, so
- // a = (y2 - 2 y1 + c) / 2, and
- // b = y1 - a - c.
- BaseFloat c = y0, a = y2 - 2 * y1 + c, b = y1 - a - c;
- if (a >= 0) {
- // The maximum of the function will occur at one of the end points.
- if (y0 > y2) {
- *x_max = 0;
- *y_max = y0;
- } else {
- *x_max = 2;
- *y_max = y2;
- }
- } else {
- // derivative y' = 2a x + b. y' = 0 at x = -b / 2 a.
- BaseFloat x = -b / (2.0 * a);
- if (x <= 0.0) {
- *x_max = 0;
- *y_max = y0;
- } else if (x >= 2.0) {
- *x_max = 0;
- *y_max = y2;
- } else {
- *x_max = x;
- *y_max = a * x * x + b * x + c;
- }
- }
-}
-
-// static
-void SinusoidDetector::QuadraticMaximize(
- BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2,
- BaseFloat *x_max, BaseFloat *y_max) {
- // Let the function be y = a x^2 + b x + c, and
- // suppose we have the values of y(0), y(x1) and y(1),
- // where 0 < x1 < 1.
- // We have y0 = c, y1 = x1^2 a + x1 b + c, and y2 = a + b + c,
- // so c = y0.
- // Also, x1.y2 - y1 = a (x1 - x1^2) + (x1 - 1) c, so
- // a = ( (x1 y2 - y1) - (x1 - 1) c) / (x1 - x1^2), and
- // b = y2 - a - c.
- BaseFloat c = y0,
- a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1),
- b = y2 - a - c;
-
- // TODO: remove these lines.
- AssertEqual(y1, a * x1 * x1 + b * x1 + c);
- AssertEqual(y2, a + b + c);
-
- if (a >= 0) {
- // The maximum of the function will occur at one of the end points.
- if (y0 > y2) {
- *x_max = 0;
- *y_max = y0;
- } else {
- *x_max = 1.0;
- *y_max = y2;
- }
- } else {
- // derivative y' = 2a x + b. y' = 0 at x = -b / 2 a.
- BaseFloat x = -b / (2.0 * a);
- if (x <= 0.0) {
- *x_max = 0.0;
- *y_max = y0;
- } else if (x >= 1.0) {
- *x_max = 1.0;
- *y_max = y2;
- } else {
- *x_max = x;
- *y_max = a * x * x + b * x + c;
- }
- }
-}
-
-//static
-BaseFloat SinusoidDetector::QuadraticInterpolate(
- BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2,
- BaseFloat x) {
- // Let the function be y = a x^2 + b x + c, and
- // suppose we have the values of y(0), y(x1) and y(1),
- // where 0 < x1 < 1.
- // We have y0 = c, y1 = x1^2 a + x1 b + c, and y2 = a + b + c,
- // so c = y0.
- // Also, x1.y2 - y1 = a (x1 - x1^2) + (x1 - 1) c, so
- // a = ( (x1 y2 - y1) - (x1 - 1) c) / (x1 - x1^2), and
- // b = y2 - a - c.
- KALDI_ASSERT(x1 >= 0.0 && x1 <= 1.0);
- if (x1 == 0.0) return y0;
- else if (x1 == 1.0) return y2;
-
- BaseFloat c = y0,
- a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1),
- b = y2 - a - c;
- return a * x * x + b * x + c;
-}
-
-// This function does
-// (*cos)(t) = cos(2 pi t freq / samp_freq)
-// (*sin)(t) = sin(2 pi t freq / samp_freq)
-//static
-void SinusoidDetector::CreateCosAndSin(BaseFloat samp_freq,
- BaseFloat freq,
- VectorBase<BaseFloat> *cos_vec,
- VectorBase<BaseFloat> *sin_vec) {
- int32 dim = cos_vec->Dim(), batch_size = 100;
- KALDI_ASSERT(dim == sin_vec->Dim());
- BaseFloat *cos_data = cos_vec->Data(), *sin_data = sin_vec->Data();
- BaseFloat factor_real = cos(M_2PI * freq / samp_freq),
- factor_im = sin(M_2PI * freq / samp_freq);
-
- // process frames in batches of size "batch_size", after which we recompute
- // the starting point to prevent loss of accuracy due to drift.
- for (int32 b = 0; b * batch_size < dim; b++) {
- int32 t_offset = b * batch_size,
- t_end = std::min(dim, t_offset + batch_size);
- double phase = M_2PI * t_offset * freq / samp_freq;
- // treat x as a complex variable which initially is equal to amplitude * exp(i * phase),
- // but which gets multiplied by "factor" on each frame.
- BaseFloat x_real = cos(phase), x_im = sin(phase);
- for (int32 t = t_offset; t < t_end; t++) {
- cos_data[t] = x_real;
- sin_data[t] = x_im;
- ComplexMul(factor_real, factor_im, &x_real, &x_im); // x *= factor.
- }
- }
-}
-
-SinusoidDetector::SinusoidDetector(BaseFloat samp_freq,
- int32 num_samp):
- samp_freq_(samp_freq),
- num_samples_(num_samp),
- num_samples_padded_(RoundUpToNearestPowerOfTwo(num_samp)),
- fft_(num_samples_padded_),
- factor1_(3.1),
- factor2_(1.42) {
- ComputeCoefficients();
-}
-
-void SinusoidDetector::SelfTest(
- const VectorBase<BaseFloat> &signal,
- const std::vector<InfoForBin> &info,
- BaseFloat final_freq,
- BaseFloat final_energy) {
- int32 num_bins = num_samples_padded_ * 2 + 1;
-
-
- {
- BaseFloat cutoff = 0.0;
- for (int32 k = 0; k <= num_bins; k += 4)
- cutoff = std::max(cutoff, info[k].energy);
- BaseFloat energy_upper_bound = factor1_ * cutoff;
- if (final_energy > energy_upper_bound) {
- KALDI_WARN << "Self-testing failed [factor1]: "
- << final_energy << " > " << energy_upper_bound
- << ", num-samples is " << num_samples_
- << ", freq/nyquist = "
- << (final_freq / (samp_freq_ * 0.5))
- << "- would require factor1 >= "
- << (final_energy / cutoff);
- }
- }
- {
- BaseFloat cutoff = 0.0;
- for (int32 k = 0; k <= num_bins; k += 2)
- if (info[k].valid)
- cutoff = std::max(cutoff, info[k].energy);
- BaseFloat energy_upper_bound = factor2_ * cutoff;
- if (final_energy > energy_upper_bound) {
- KALDI_WARN << "Self-testing failed [factor2]: "
- << final_energy << " > " << energy_upper_bound
- << ", num-samples is " << num_samples_
- << ", freq/nyquist = "
- << (final_freq / (samp_freq_ * 0.5))
- << "- would require factor2 >= "
- << (final_energy / cutoff);
-
- }
- }
-
-}
-
-
-BaseFloat SinusoidDetector::OptimizeFrequency(
- const std::vector<InfoForBin> &info,
- int32 *bin_out,
- BaseFloat *offset_out) const {
-
- BaseFloat max_energy = 0.0;
- *bin_out = -1;
- int32 max_freq = num_samples_padded_ * 2;
-
- // For each bin, we consider the frequency range [bin, bin+1, bin+2],
- // and if we have info for all those bins, do a quadratic interpolation to
- // find the maximum within the range.
- for (int32 bin = 0; bin + 2 <= max_freq; bin++) {
- if (info[bin].valid && info[bin+1].valid && info[bin+2].valid) {
- // First handle the left side of the bin.
- BaseFloat best_x, best_y;
- QuadraticMaximizeEqualSpaced(info[bin].energy, info[bin+1].energy,
- info[bin+2].energy, &best_x, &best_y);
- if (best_y > max_energy) {
- max_energy = best_y;
- if (best_x <= 1.0) {
- *bin_out = bin;
- *offset_out = best_x;
- } else {
- *bin_out = bin + 1;
- *offset_out = best_x - 1;
- }
- }
- }
- }
- return max_energy;
-}
-
-
-BaseFloat SinusoidDetector::DetectSinusoid(
- BaseFloat min_energy,
- const VectorBase<BaseFloat> &signal,
- Sinusoid *sinusoid) {
- if (signal(0) == 0.0 && signal.Norm(2.0) == 0.0)
- return 0.0;
- KALDI_ASSERT(signal.Dim() == num_samples_);
- Vector<BaseFloat> fft(num_samples_padded_);
- fft.Range(0, num_samples_).CopyFromVec(signal);
- bool forward = true;
- fft_.Compute(fft.Data(), forward);
-
- std::vector<InfoForBin> info;
- ComputeCoarseInfo(fft, &info);
- // we now have info for the "coarse" bins.
-
- // each element b of "bins" will be a multiple of 4: it's possible
- // that the best frequency is in the range [b, b+4]
- std::vector<int32> bins;
- FindCandidateBins(min_energy, info, &bins);
-
- if (bins.empty())
- return 0.0; // not enough energy in signal.
-
- for (size_t i = 0; i < bins.size(); i++) {
- int32 bin = bins[i];
- ComputeBinInfo(signal, bin, &(info[bin]));
- }
-
- std::vector<int32> bins2;
- FindCandidateBins2(min_energy, info, &bins2);
-
- for (size_t i = 0; i < bins2.size(); i++) {
- int32 bin = bins2[i];
- ComputeBinInfo(signal, bin, &(info[bin]));
- }
-
- // compute energy for the predicted-optimum point, which will usually be
- // between bins, with an offset.
- int32 bin;
- BaseFloat offset;
-
- BaseFloat opt_energy = OptimizeFrequency(info, &bin, &offset);
-
- if (opt_energy == 0.0)
- return 0.0;
-
- BaseFloat max_freq = (bin + offset) * samp_freq_ / (num_samples_padded_ * 4);
-
- KALDI_VLOG(4) << "Best frequency based on interpolation is "
- << max_freq << ", best energy is "
- << opt_energy << ", bin is " << bin;
-
- OptimizedInfo final_info;
-
- FineOptimizeFrequency(signal, bin, offset, &info, &final_info);
-
- // the following while loop will rarely be accessed.
- while (final_info.offset == 0.0 && bin > 0) {
- bin--;
- FineOptimizeFrequency(signal, bin, 1.0, &info, &final_info);
- }
-
- // the following while loop will rarely be accessed.
- while (final_info.offset == 1.0 && bin < num_samples_padded_ * 2) {
- bin++;
- FineOptimizeFrequency(signal, bin, 0.0, &info, &final_info);
- }
-
- if (bin <= 1 || bin >= num_samples_padded_ * 2 - 2) {
- // If we're in the lowest or next-to-lowest bin, or the highest or
- // next-to-highest allowed bin (note, "bin" here is a range, and it can
- // never have the value num_samples_padded_ * 2), we tend to get more
- // estimation error than usual, so do another round of optimization.
- FineOptimizeFrequency(signal, bin, final_info.offset, &info, &final_info);
- }
-
- BaseFloat final_freq = (final_info.bin + final_info.offset) * samp_freq_ / (num_samples_padded_ * 4);
- KALDI_VLOG(4) << "Final optimized info is: freq " << final_freq
- << ", cos coeff " << final_info.cos_coeff << ", sin coeff "
- << final_info.sin_coeff << ", energy " << final_info.energy;
-
- if (GetVerboseLevel() > 1)
- SelfTest(signal, info, final_freq, final_info.energy);
-
- if (final_info.energy >= min_energy) {
- sinusoid->amplitude = std::sqrt(final_info.cos_coeff * final_info.cos_coeff
- + final_info.sin_coeff * final_info.sin_coeff);
- sinusoid->freq = final_freq;
- sinusoid->phase = -std::atan2(final_info.sin_coeff, final_info.cos_coeff);
- KALDI_VLOG(4) << "Phase is " << sinusoid->phase << ", amplitude is "
- << sinusoid->amplitude << ", freq is " << sinusoid->freq;
- return final_info.energy;
- } else {
- return 0.0;
- }
-}
-
-
-/*
- This function computes, the original FFT bins, the amount of energy in
- the signal that can be explained by a sinusoid at the corresponding frequency.
-
- Let f be the continuous-valued frequency.
-
- Define the vector C_f as
- C_f = [ c_0, c_1 ... c_n ] where c_k = cos(2 pi k f / samp_freq). [obviously this notation depends on f].
- and S_f the same thing with sin in place of cos.
-
- Let the signal, as a vector, be V.
- We want to maximize the (positive) energy-difference:
- ||V||^2 - || V - c C_f - s S_f ||^2
- where c and s are the coefficients of C_f and S_f.
- This quantity can be expanded as follows, where . means dot product.
- \delta E = -c^2 C_f.C_f - s^2 S_f.S_f - 2 c s C_f.S_f + 2 c V.C_f + 2 s V.S_f.
- which can be written as follows, where . means dot-product and ' means transpose:
- \delta E = 2 [c s] v - [c s] M [c s]'
- where M = [ C_f.C_f, C_f.S_f, C_f.S_f, S_f.S_f ],
- and v = [V.C_f, V.S_f].
- If M is invertible (i.e. for nonzero frequencies), this is maximized by
- [c s] = M^-1 v
- giving us the value.
- \delta E = v' M^{-1} v.
- We'll compute the inverse of M in advance, inside ComputeCoefficients(), using
- the formula [a b;c d]^-1 = 1/(ad - bc) [d -b; -c a] For zero frequency and at the
- Nyquist, M has the value [ a 0; 0 0 ], and we have the same type of expression
- limited to the first dim of v, i.e. Minv = [ a^{-1} 0; 0 0 ], a kind of pseudo-inverse.
- */
-
-void SinusoidDetector::ComputeCoarseInfo(
- const Vector<BaseFloat> &fft,
- std::vector<InfoForBin> *info) const {
- info->resize(num_samples_padded_ * 2 + 1); // 4 times resolution of FFT itself.
-
- const BaseFloat *fft_data = fft.Data();
-
- int32 num_bins = num_samples_padded_ / 2 + 1;
- for (int32 k = 0; k < num_bins; k++) {
- BaseFloat real, im;
- if (k == 0) {
- real = fft_data[0];
- im = 0.0;
- } else if (k == num_samples_padded_ / 2) {
- real = fft_data[1];
- im = 0.0;
- } else {
- real = fft_data[k * 2];
- im = fft_data[k * 2 + 1];
- }
- // v1 and v2 are the two components of the vector v in the math above.
- BaseFloat v1 = real, v2 = -im;
- // Minv_'s row indexes correspond to frequencies with 4 times more
- // resolution than the FFT bins.
- const BaseFloat *Minv_data = Minv_.RowData(k * 4);
- // The Matrix M^{-1} is of the form [a b; b d]
- BaseFloat a = Minv_data[0], b = Minv_data[1], d = Minv_data[2];
- // compute \delta E = v' M^{-1} v.
- BaseFloat delta_e = v1 * v1 * a + v2 * v2 * d + 2 * v1 * v2 * b;
- InfoForBin &this_info = (*info)[k * 4];
- this_info.valid = true;
- this_info.cos_dot = real;
- this_info.sin_dot = -im;
- this_info.energy = delta_e;
- }
-}
-
-
-void SinusoidDetector::ComputeCoefficients() {
- int32 num_samp = num_samples_;
- int32 num_freq = num_samples_padded_ * 2 + 1;
- cos_.Resize(num_freq, num_samp);
- sin_.Resize(num_freq, num_samp);
-
- Vector<BaseFloat> cc(num_freq), cs(num_freq);
- for (int32 k = 0; k < num_freq; k++) {
- BaseFloat freq = k * samp_freq_ / (num_samples_padded_ * 4);
- SubVector<BaseFloat> c(cos_, k), s(sin_, k);
- CreateCosAndSin(samp_freq_, freq, &c, &s);
- cc(k) = VecVec(c, c);
- cs(k) = VecVec(c, s);
- }
-
- M_.Resize(num_freq, 3, kUndefined);
- Minv_.Resize(num_freq, 3, kUndefined);
-
- for (int32 k = 0; k < num_freq; k++) {
- // Let the matrix M be [ a b; b d ]. [we don't write c because c == b].
- // We want to compute Minv_.
- BaseFloat a = cc(k), b = cs(k), d = num_samples_ - a;
- M_(k, 0) = a;
- M_(k, 1) = b;
- M_(k, 2) = d;
- if (k == 0 || k == num_freq - 1) {
- // this is a special case; it's not really the inverse of M but it will
-
- // give us the expression we want; it's like an inverse in just one dimension.
- Minv_(k, 0) = 1.0 / a;
- Minv_(k, 1) = 0.0;
- Minv_(k, 2) = 0.0;
- } else {
- BaseFloat inv_det = 1.0 / (a * d - b * b);
- // check for NaN and inf.
- KALDI_ASSERT(inv_det == inv_det && inv_det - inv_det == 0.0);
- // use: [a b;c d]^-1 = 1/(ad - bc) [d -b; -c a], special case where c = b.
- BaseFloat inv_a = d * inv_det, inv_b = -b * inv_det, inv_d = a * inv_det;
- Minv_(k, 0) = inv_a;
- Minv_(k, 1) = inv_b;
- Minv_(k, 2) = inv_d;
- }
- }
-}
-
-
-// Does fine optimization of the frequency within this bin; returns the
-// final energy, the optimized frequency, and the cos and sin coefficients.
-void SinusoidDetector::FineOptimizeFrequency(
- const VectorBase<BaseFloat> &signal,
- int32 bin,
- BaseFloat bin_offset,
- std::vector<InfoForBin> *info_in,
- OptimizedInfo *opt_info) const {
- std::vector<InfoForBin> &info = *info_in;
- if (!info[bin].valid) ComputeBinInfo(signal, bin, &(info[bin]));
- if (!info[bin+1].valid) ComputeBinInfo(signal, bin+1, &(info[bin+1]));
-
- const BaseFloat epsilon = 0.02, delta = 0.001;
-
- // If the offset is very close to the edges of the bin, move it
- // closer to the center. Otherwise we may have problems with the
- // steps below. The initial offset is only used as a starting point
- // anyway, so this won't affect the final value much.
- if (bin_offset < epsilon)
- bin_offset = epsilon;
- if (bin_offset > 1.0 - epsilon)
- bin_offset = 1.0 - epsilon;
- KALDI_VLOG(4) << "Initial bin offset = " << bin_offset << ", bin = " << bin;
-
- // create cos and sin waves of the specified frequency.
- BaseFloat freq = (bin + bin_offset) * samp_freq_ / (num_samples_padded_ * 4);
- Vector<BaseFloat> c(num_samples_, kUndefined), s(num_samples_, kUndefined);
- CreateCosAndSin(samp_freq_, freq, &c, &s);
-
- // these a, b and d values are the elements of the M matrix at this frequency
- // "freq", i.e. the matrix M_f [ a b; b d ]. This will be invertible because
- // we have ensured that the frequency is not too close to zero or the Nyquist.
- BaseFloat a = VecVec(c, c), b = VecVec(c, s), d = num_samples_ - a;
- BaseFloat inv_det = 1.0 / (a * d - b * b);
- BaseFloat inv_a = d * inv_det, inv_b = -b * inv_det, inv_d = a * inv_det;
-
-
- BaseFloat v1 = VecVec(c, signal), v2 = VecVec(s, signal);
-
- BaseFloat delta_e = v1 * v1 * inv_a + v2 * v2 * inv_d + 2 * v1 * v2 * inv_b;
-
- KALDI_VLOG(4) << "Actual energy-change at frequency " << freq << " is "
- << delta_e;
- // "freq" is frequency somewhere in the middle of the bin.
-
- BaseFloat final_offset, final_energy;
- QuadraticMaximize(bin_offset, info[bin].energy, delta_e, info[bin+1].energy,
- &final_offset, &final_energy);
-
- KALDI_VLOG(4) << "After further optimizing, offset was " << final_offset
- << " giving freq "
- << ((bin+final_offset) * samp_freq_ / (num_samples_padded_*4))
- << ", with energy " << final_energy;
-
- // Use interpolation (using a quadratic function) to get the entries of the M matrix
- // the the final, tuned frequency. Interpolation on M is better than M^{-1}, as its
- // elements are much better behaved as the frequency varies.
- const BaseFloat *M_left_data = M_.RowData(bin),
- *M_right_data = M_.RowData(bin + 1);
-
- BaseFloat a_interp = QuadraticInterpolate(bin_offset, M_left_data[0], a, M_right_data[0],
- final_offset);
- BaseFloat b_interp = QuadraticInterpolate(bin_offset, M_left_data[1], b, M_right_data[1],
- final_offset);
- BaseFloat d_interp = QuadraticInterpolate(bin_offset, M_left_data[2], d, M_right_data[2],
- final_offset);
-
- // Now get the inverse of the M matrix at the final point.
- BaseFloat a_inv_interp, b_inv_interp, d_inv_interp;
-
- if ((bin == 0 && final_offset < delta) ||
- (bin == num_samples_padded_ * 2 && final_offset > 1.0 - delta)) {
- // If we're extremely close to zero or the Nyquist, we'll have trouble
- // inverting M; just invert in the 1st dimension (only have a cos
- // component).
- a_inv_interp = 1.0 / a_interp;
- b_inv_interp = 0.0;
- d_inv_interp = 0.0;
- } else {
- BaseFloat inv_det = 1.0 / (a_interp * d_interp - b_interp * b_interp);
- // check for NaN and inf.
- KALDI_ASSERT(inv_det == inv_det && inv_det - inv_det == 0.0);
- // use: [a b;c d]^-1 = 1/(ad - bc) [d -b; -c a], special case where c = b.
- a_inv_interp = d_interp * inv_det;
- b_inv_interp = -b_interp * inv_det;
- d_inv_interp = a_interp * inv_det;
- }
-
- BaseFloat v1_interp = QuadraticInterpolate(bin_offset, info[bin].cos_dot, v1,
- info[bin+1].cos_dot, final_offset);
- BaseFloat v2_interp = QuadraticInterpolate(bin_offset, info[bin].sin_dot, v2,
- info[bin+1].sin_dot, final_offset);
-
- opt_info->bin = bin;
- opt_info->offset = final_offset;
- // Recompute the energy-reduction using the more accurate interpolated values of
- // v1 and v2 (the dot-products of the cos and sin with the signal), and
- // of M.
- opt_info->energy = v1_interp * v1_interp * a_inv_interp +
- v2_interp * v2_interp * d_inv_interp +
- 2 * v1_interp * v2_interp * b_inv_interp;
- // Compute the coefficients of the cos and sin in the optimal sinusoid, as
- // M^{-1} v.
- opt_info->cos_coeff = a_inv_interp * v1_interp + b_inv_interp * v2_interp;
- opt_info->sin_coeff = b_inv_interp * v1_interp + d_inv_interp * v2_interp;
-}
-
-void SinusoidDetector::FindCandidateBins(
- BaseFloat min_energy,
- const std::vector<InfoForBin> &info,
- std::vector<int32> *bins) const {
-
- int32 max_bin = num_samples_padded_ * 2;
-
- BaseFloat cutoff = min_energy;
- for (int32 k = 0; k <= max_bin; k += 4) {
- KALDI_ASSERT(info[k].valid);
- cutoff = std::max(cutoff, info[k].energy);
- }
-
- for (int32 k = 0; k < max_bin; k += 4) {
- BaseFloat energy_upper_bound =
- factor1_ * std::max(info[k].energy,
- info[k+4].energy);
- if (energy_upper_bound >= cutoff)
- bins->push_back(k + 2);
- }
-}
-
-
-void SinusoidDetector::FindCandidateBins2(
- BaseFloat min_energy,
- const std::vector<InfoForBin> &info,
- std::vector<int32> *bins2) const {
-
- int32 max_bin = num_samples_padded_ * 2;
-
- BaseFloat cutoff = min_energy;
- for (int32 k = 0; k <= max_bin; k += 2) {
- if (info[k].valid)
- cutoff = std::max(cutoff, info[k].energy);
- }
-
- for (int32 k = 0; k < max_bin; k += 2) {
- if (info[k].valid && info[k+2].valid) {
- BaseFloat energy_upper_bound =
- factor2_ * std::max(info[k].energy,
- info[k+2].energy);
- if (energy_upper_bound >= cutoff)
- bins2->push_back(k + 1);
- }
- }
-}
-
-
-void SinusoidDetector::ComputeBinInfo(
- const VectorBase<BaseFloat> &signal,
- int32 bin,
- InfoForBin *info) const {
- KALDI_ASSERT(!info->valid); // or wasted time.
- info->valid = true;
- BaseFloat v1 = info->cos_dot = VecVec(cos_.Row(bin), signal);
- BaseFloat v2 = info->sin_dot = VecVec(sin_.Row(bin), signal);
- const BaseFloat *Minv_data = Minv_.RowData(bin);
- BaseFloat a = Minv_data[0], b = Minv_data[1], d = Minv_data[2];
- // compute \delta E = v' M^{-1} v.
- BaseFloat delta_e = v1 * v1 * a + v2 * v2 * d + 2 * v1 * v2 * b;
- info->energy = delta_e;
-}
-
-
-MultiSinusoidDetector::MultiSinusoidDetector(
- const MultiSinusoidDetectorConfig &config,
- int32 sampling_freq):
- config_(config),
- sample_freq_(sampling_freq),
- samples_per_frame_subsampled_(0.001 * config.frame_length_ms *
- static_cast<BaseFloat>(config.subsample_freq)),
- waveform_finished_(false),
- samples_consumed_(0),
- resampler_(sampling_freq, config.subsample_freq,
- config.subsample_filter_cutoff, config.subsample_filter_zeros),
- detector_(config.subsample_freq, samples_per_frame_subsampled_) {
- config.Check();
-}
-
-
-void MultiSinusoidDetector::Reset() {
- waveform_finished_ = false;
- samples_consumed_ = 0;
- while(!subsampled_signal_.empty()) {
- delete subsampled_signal_.front();
- subsampled_signal_.pop_front();
- }
- resampler_.Reset();
-}
-
-void MultiSinusoidDetector::WaveformFinished() {
- KALDI_ASSERT(!waveform_finished_ &&
- "WaveformFinished() called twice.");
-
- Vector<BaseFloat> empty_waveform;
- subsampled_signal_.push_back(new Vector<BaseFloat>());
- bool flush = true;
- resampler_.Resample(empty_waveform, flush,
- subsampled_signal_.back());
- waveform_finished_ = true;
- if (subsampled_signal_.back()->Dim() == 0) {
- delete subsampled_signal_.back();
- subsampled_signal_.pop_back();
- }
-}
-
-void MultiSinusoidDetector::AcceptWaveform(
- const VectorBase<BaseFloat> &waveform) {
-
-
- subsampled_signal_.push_back(new Vector<BaseFloat>());
- bool flush = false;
- resampler_.Resample(waveform, flush,
- subsampled_signal_.back());
- if (subsampled_signal_.back()->Dim() == 0) {
- delete subsampled_signal_.back();
- subsampled_signal_.pop_back();
- }
-}
-
-int32 MultiSinusoidDetector::NumSubsampledSamplesReady(int32 max_samp) const {
- KALDI_ASSERT(samples_consumed_ >= 0 &&
- ((subsampled_signal_.empty() && samples_consumed_ == 0) ||
- (!subsampled_signal_.empty () && samples_consumed_ <
- subsampled_signal_[0]->Dim())));
-
- int32 ans = -samples_consumed_;
- for (size_t i = 0; i < subsampled_signal_.size(); i++) {
- ans += subsampled_signal_[i]->Dim();
- if (ans > max_samp) break;
- }
- KALDI_ASSERT(ans >= 0);
- return std::min(ans, max_samp);
-}
-
-bool MultiSinusoidDetector::Done() const {
- int32 samp_ready = NumSubsampledSamplesReady(samples_per_frame_subsampled_);
- if ((samp_ready >= samples_per_frame_subsampled_ && !waveform_finished_) ||
- (samp_ready > 0 && waveform_finished_))
- return false;
- else
- return true;
-}
-
-void MultiSinusoidDetector::GetNextFrameOfSignal(Vector<BaseFloat> *frame) {
- frame->Resize(samples_per_frame_subsampled_, kUndefined);
-
- int32 sample_offset = 0,
- samples_needed = samples_per_frame_subsampled_;
- while (samples_needed > 0 &&
- !subsampled_signal_.empty()) {
- Vector<BaseFloat> *src = subsampled_signal_.front();
- int32 num_samples_avail = src->Dim() - samples_consumed_;
- KALDI_ASSERT(num_samples_avail > 0);
- int32 chunk_size = std::min(num_samples_avail,
- samples_needed);
- frame->Range(sample_offset, chunk_size).CopyFromVec(
- src->Range(samples_consumed_, chunk_size));
- sample_offset += chunk_size;
- samples_needed -= chunk_size;
- samples_consumed_ += chunk_size;
- if (samples_consumed_ == src->Dim()) {
- samples_consumed_ = 0;
- delete src;
- subsampled_signal_.pop_front();
- }
- }
- if (samples_needed > 0) {
- KALDI_ASSERT(waveform_finished_ && sample_offset > 0); // or code error.
- frame->Range(sample_offset, samples_needed).SetZero();
- }
-}
-
-
-void MultiSinusoidDetector::GetNextFrame(MultiSinusoidDetectorOutput *output) {
- Vector<BaseFloat> frame;
- GetNextFrameOfSignal(&frame);
- // Mean subtraction
- frame.Add(-1.0 * frame.Sum() / frame.Dim());
- *output = MultiSinusoidDetectorOutput(); // reset to default.
-
- BaseFloat signal_energy = VecVec(frame, frame);
- output->tot_energy = signal_energy / frame.Dim();
- if (signal_energy == 0.0) return;
-
- // min_energy1 is the lowest energy we might care about.
- BaseFloat min_energy1 = signal_energy *
- std::min<BaseFloat>(config_.two_freq_min_total_energy * 0.5,
- config_.one_freq_min_energy);
-
- Sinusoid sinusoid1;
- BaseFloat energy1 = detector_.DetectSinusoid(min_energy1,
- frame,
- &sinusoid1);
-
- if (energy1 == 0.0) return; // Nothing detected.
-
- // we only care about the 2nd sinusoid if
- // energy1 + energy2 >= signal_energy * two_freq_min_total_energy,
- // and energy2 >= signal_energy * config.two_freq_min_energy.
-
- BaseFloat min_energy2 =
- std::max(signal_energy * config_.two_freq_min_energy,
- signal_energy * config_.two_freq_min_total_energy
- - energy1);
-
- BaseFloat energy2;
- Sinusoid sinusoid2;
-
- // If there is enough energy left in the signal that we could
- // possibly detect a sinusoid of energy at least min_energy2...
- if (min_energy2 <= signal_energy - energy1) {
- sinusoid1.phase += M_PI; // reverse the phase.
- AddSinusoid(config_.subsample_freq, sinusoid1, &frame);
-
-
- energy2 = detector_.DetectSinusoid(min_energy2,
- frame,
- &sinusoid2);
-
- if (energy2 > energy1) {
- // The following is just for our information, so we are aware
- // when the sinusoid detection gives us the non-optimal sinusoid
- // first.
- BaseFloat factor = energy2 / energy1;
- KALDI_VLOG(2) << "Second sinusoid greater than first by a factor of "
- << factor << ". (This means sinusoid detection is not "
- << " working ideally).";
- }
-
- if (DetectedTwoFrequency(signal_energy,
- sinusoid1, energy1,
- sinusoid2, energy2,
- output))
- return;
- } else {
- energy2 = 0.0;
- }
- // We don't need the return status of the following; we just return anyway.
- DetectedOneFrequency(signal_energy,
- sinusoid1, energy1,
- sinusoid2, energy2,
- output);
-}
-
-// acceptable two-frequency tone.
-bool MultiSinusoidDetector::DetectedTwoFrequency(
- BaseFloat signal_energy,
- const Sinusoid &sinusoid1,
- BaseFloat energy1,
- const Sinusoid &sinusoid2,
- BaseFloat energy2,
- MultiSinusoidDetectorOutput *output) {
-
- if (energy1 + energy2 >= signal_energy *
- config_.two_freq_min_total_energy &&
- std::min(energy1, energy2) >= signal_energy *
- config_.two_freq_min_energy &&
- std::min(sinusoid1.freq, sinusoid2.freq) >= config_.min_freq &&
- std::max(sinusoid1.freq, sinusoid2.freq) <= config_.max_freq) {
- output->freq1 = sinusoid1.freq;
- output->energy1 = energy1 / signal_energy;
- output->freq2 = sinusoid2.freq;
- output->energy2 = energy2 / signal_energy;
- if (output->freq1 > output->freq2) {
- std::swap(output->freq1, output->freq2);
- std::swap(output->energy1, output->energy2);
- }
- return true;
- } else {
- return false;
- }
-}
-
-
-// acceptable two-frequency tone.
-bool MultiSinusoidDetector::DetectedOneFrequency(
- BaseFloat signal_energy,
- const Sinusoid &sinusoid1,
- BaseFloat energy1,
- const Sinusoid &sinusoid2,
- BaseFloat energy2,
- MultiSinusoidDetectorOutput *output) {
- // If sinusoid detection were performing exactly to spec, we could assume
- // energy1 >= energy2, but we don't assume this as it's not guaranteed.
- if (energy1 > energy2 && energy1 > signal_energy *
- config_.one_freq_min_energy &&
- sinusoid1.freq >= config_.min_freq &&
- sinusoid1.freq <= config_.max_freq) {
- output->freq1 = sinusoid1.freq;
- output->energy1 = energy1 / signal_energy;
- output->freq2 = 0.0;
- output->energy2 = 0.0;
- return true;
- } else if (energy2 > energy1 && energy2 > signal_energy *
- config_.one_freq_min_energy &&
- sinusoid2.freq >= config_.min_freq &&
- sinusoid2.freq <= config_.max_freq) {
- output->freq1 = sinusoid2.freq;
- output->energy1 = energy2 / signal_energy;
- output->freq2 = 0.0;
- output->energy2 = 0.0;
- return true;
- } else {
- return false;
- }
-}
-
-
-void DetectSinusoids(const VectorBase<BaseFloat> &signal,
- MultiSinusoidDetector *detector,
- Matrix<BaseFloat> *output) {
- std::vector<MultiSinusoidDetectorOutput> output_vec;
- detector->AcceptWaveform(signal);
- detector->WaveformFinished();
-
- int32 safety_margin = 10, approx_num_frames = safety_margin +
- (signal.Dim() / (detector->SamplingFrequency() *
- detector->FrameShiftSecs()));
- output_vec.reserve(approx_num_frames);
- while (!detector->Done()) {
- output_vec.resize(output_vec.size() + 1);
- detector->GetNextFrame(&(output_vec.back()));
- }
- detector->Reset();
- if (output_vec.empty()) {
- output->Resize(0, 0);
- } else {
- output->Resize(output_vec.size(), 5, kUndefined);
- for (int32 i = 0; i < output->NumRows(); i++) {
- BaseFloat *row_data = output->RowData(i);
- MultiSinusoidDetectorOutput &this_output = output_vec[i];
- row_data[0] = this_output.tot_energy;
- row_data[1] = this_output.freq1;
- row_data[2] = this_output.energy1;
- row_data[3] = this_output.freq2;
- row_data[4] = this_output.energy2;
- }
- }
-}
-
-
-} // namespace kaldi
-
diff --git a/src/feat/sinusoid-detection.h b/src/feat/sinusoid-detection.h
+++ /dev/null
@@ -1,436 +0,0 @@
-// feat/sinusoid-detection.h
-
-// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_FEAT_SINUSOID_DETECTION_H_
-#define KALDI_FEAT_SINUSOID_DETECTION_H_
-
-
-#include "base/kaldi-error.h"
-#include "matrix/matrix-lib.h"
-#include "util/common-utils.h"
-#include "feat/resample.h"
-#include <deque>
-
-namespace kaldi {
-/// @addtogroup feat FeatureExtraction
-/// @{
-
-
-struct Sinusoid {
- // this structure used to represent a sinusoid of type amplitude cos (2 pi
- // freq t + phase), in the SinusoidDetector code.
- BaseFloat amplitude;
- BaseFloat freq;
- BaseFloat phase;
- Sinusoid(BaseFloat a, BaseFloat f, BaseFloat p):
- amplitude(a), freq(f), phase(p) { }
- Sinusoid() {}
-};
-
-
-// This function adds the given sinusoid to the signal, as:
-// (*signal)(t) += amplitude * cos(2 pi freq/samp_freq t + phase).
-void AddSinusoid(BaseFloat samp_freq,
- const Sinusoid &sinusoid,
- VectorBase<BaseFloat> *signal);
-
-
-class SinusoidDetector {
- public:
- SinusoidDetector(BaseFloat samp_freq,
- int32 num_samp);
-
-
- // Detect the dominant sinusoid component in the signal, as long as the
- // energy-reduction of the signal from subtracting that sinuoid would be >=
- // "min_energy_change", and return that energy reduction; or zero if no
- // candidate was found.
- // non-const because the FFT class has a temporary buffer.
- BaseFloat DetectSinusoid(BaseFloat min_energy_change,
- const VectorBase<BaseFloat> &signal,
- Sinusoid *sinusoid);
-
- // This function does quadratic interpolation for a function that is known at
- // three equally spaced points [x0 x1 x2] = [0 1 2], and we want the x-value
- // and corresponding y-value at the maximum of the function within the range
- // 0 <= x <= 2. It's public for testing reasons.
- static void QuadraticMaximizeEqualSpaced(
- BaseFloat y0, BaseFloat y1, BaseFloat y2,
- BaseFloat *x, BaseFloat *y);
-
-
- // This function does quadratic interpolation for a function that is known at
- // three points x0, x1 and x2 with x0 = 0, 0 < x1 < 1 and x2 = 1, where we
- // want the x-value and corresponding y-value at the maximum of the function
- // within the range 0 <= x <= 1. It's public for testing reasons.
- static void QuadraticMaximize(
- BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2,
- BaseFloat *x, BaseFloat *y);
-
- // This function does quadratic interpolation for a function that is known at
- // three points x0, x1 and x2 with x0 = 0, 0 <= x1 <= 1 and x2 = 1, where
- // we want the value at a specific value x. The corresponding y-value is returned.
- static BaseFloat QuadraticInterpolate(
- BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2,
- BaseFloat x);
-
-
- private:
- BaseFloat samp_freq_;
- int32 num_samples_;
- int32 num_samples_padded_; // Number of samples, after zero-padding to power of 2.
- SplitRadixRealFft<BaseFloat> fft_; // Object used to compute FFT of padded_signal_.
-
- BaseFloat factor1_; // When we search the range between two FFT bins, we
- // assume that the maximum energy-reduction within the
- // range may be greater than the maximum of the
- // energy-reductions at either side, by at most
- // "factor1", with factor1 > 1.0. The analysis is quite
- // hard so we determine this factor empirically. Making
- // this as small as possible helps us avoid searching too
- // many bins.
-
- BaseFloat factor2_; // As factor1, but for searches within a half-fft-bin
- // range. Again determined empirically. After that we
- // use quadratic interpolation to find the maximum energy.
-
- // This matrix, of dimension (num_samples_padded_ * 2 + 1) by
- // num_samples_, has in each row, a different frequency of cosine wave.
- Matrix<BaseFloat> cos_;
- // This matrix, of dimension (num_samples_padded_ * 2 + 1) by
- // num_samples_, has in each row, a different frequency of sine wave.
- Matrix<BaseFloat> sin_;
-
- // M_ is a precomputed matrix of dimension (num_samples_padded_ * 2 + 1) by 3,
- // containing the values x y z of a symmetric matrix [ a b; b c ]. There is
- // one of these matrices for each frequency, sampled at one quarter the
- // spacing of the FFT bins. There is a long comment next to the definition of
- // ComputeCoefficients that describes this.
- Matrix<BaseFloat> M_;
-
- // Minv_ is the coefficients in the same format as M_, but containing the
- // corresponding coefficients of the inverse matrix. There is a long comment
- // next to the definition of ComputeCoefficients that describes this.
- Matrix<BaseFloat> Minv_;
-
-
- struct InfoForBin {
- bool valid;
- BaseFloat cos_dot; // dot product of signal with cosine on left frequency
- BaseFloat sin_dot; // dot product of signal with sine on left frequency
- BaseFloat energy; // energy.
- InfoForBin(): valid(false) { }
- };
-
- // Info after fine optimization within a bin.
- struct OptimizedInfo {
- int32 bin;
- BaseFloat offset;
- BaseFloat energy;
- BaseFloat cos_coeff;
- BaseFloat sin_coeff;
- };
-
- // Compute the coefficients and energies at the original FFT bins (every
- // fourth entry in "info").
- void ComputeCoarseInfo(const Vector<BaseFloat> &fft,
- std::vector<InfoForBin> *info) const;
-
-
- // After the coarse-level info is computed using ComputeCoarseInfo, finds a
- // set of intermediate bin indexes to compute, that are the midpoints of
- // coarse-level bins.
- void FindCandidateBins(BaseFloat min_energy,
- const std::vector<InfoForBin> &info,
- std::vector<int32> *bins) const;
-
- void FindCandidateBins2(BaseFloat min_energy,
- const std::vector<InfoForBin> &info,
- std::vector<int32> *bins) const;
-
-
- void ComputeBinInfo(const VectorBase<BaseFloat> &signal,
- int32 bin, InfoForBin *info) const;
-
-
- // For each bin b such that we have valid "info" data for bins b, b+1 and b+2,
- // does quadratic interpolation to find the maximum predicted energy in the
- // range [b, b+2]. The location of the maximum predicted energy is output to
- // "bin_out" and "offset_out", and the corresponding predicted energy is
- // returned.
- //
- // Note: if there are two different frequencies with similar maximum energies
- // (e.g. within a factor of probably around 1.2 or so), the fact that
- // OptimizeFrequency only returns one maximum may potentially lead to the
- // smaller maximum being output. We could have modified this to output
- // multiple different maxima, which could have been more accurate in terms of
- // being guaranteed to output the best maximum, but this probably wouldn't
- // have a measurable impact on our application so we haven't bothered.
- BaseFloat OptimizeFrequency(
- const std::vector<InfoForBin> &info,
- int32 *bin_out,
- BaseFloat *offset_out) const;
-
-
- // This function does
- // (*cos)(t) = cos(2 pi t freq / samp_freq)
- // (*sin)(t) = sin(2 pi t freq / samp_freq)
- static void CreateCosAndSin(BaseFloat samp_freq,
- BaseFloat freq,
- VectorBase<BaseFloat> *cos,
- VectorBase<BaseFloat> *sin);
-
- // Do fine optimization of the frequency within a bin, given a reasonable
- // approximate position within it based on interpolation (that should be close
- // to the optimum).
- void FineOptimizeFrequency(
- const VectorBase<BaseFloat> &signal,
- int32 bin,
- BaseFloat offset,
- std::vector<InfoForBin> *info,
- OptimizedInfo *opt_info) const;
-
- // Computes the coefficients cos_, sin_, and Minv_.
- void ComputeCoefficients();
-
- // Calls some self-testing code that prints warnings if
- // some of our assumptions were wrong.
- void SelfTest(const VectorBase<BaseFloat> &signal,
- const std::vector<InfoForBin> &info,
- BaseFloat final_freq,
- BaseFloat final_energy);
-
-};
-
-
-
-/**
- This configuration class is for the frame-by-frame detection of
- cases where there are one or two sinusoids that can explain
- a lot of the energy in the signal.
-*/
-struct MultiSinusoidDetectorConfig {
-
- // frame length in milliseconds
- BaseFloat frame_length_ms;
- // frame shift in milliseconds
- BaseFloat frame_shift_ms;
-
- // Proportion of the total energy of the signal that the quieter of
- // the two sinusoids must comprise, in order to be counted, if two
- // sinusoids are detected.
- BaseFloat two_freq_min_energy;
-
- // Proportion of the total energy of the signal that both sinusoids (if
- // two are detected) must comprise, in order to be output.
- BaseFloat two_freq_min_total_energy;
-
- // Proportion of the total energy of the signal that a single sinusoid
- // must comprise, in order to be output, if we are considering
- // reporting a single sinusoid. Note: detection of two sinusoids
- // will take precedence over detection of a single sinusoid.
- BaseFloat one_freq_min_energy;
-
- // Lower end of frequency range that we consider; frequencies outside
- // this range are not candidates to appear in the detected output.
- BaseFloat min_freq;
- // Upper end of frequency range that we consider, see min_freq.
- BaseFloat max_freq;
-
- // Frequency to which we subsample the signal before processing it.
- // Must be integer because of how LinearResample code works.
- int32 subsample_freq;
-
- // Filter cut-off frequency used in sub-sampling.
- BaseFloat subsample_filter_cutoff;
-
- // the following is not critical and is not exported to the
- // command line.
- int32 subsample_filter_zeros;
-
- MultiSinusoidDetectorConfig():
- frame_length_ms(20), frame_shift_ms(10),
- two_freq_min_energy(0.2), two_freq_min_total_energy(0.6),
- one_freq_min_energy(0.75), min_freq(300.0),
- max_freq(1800.0), subsample_freq(4000),
- subsample_filter_cutoff(1900.0), subsample_filter_zeros(5) {}
-
- void Register(OptionsItf *opts) {
- opts->Register("frame-length", &frame_length_ms,
- "Frame length in milliseconds");
- opts->Register("frame-shift", &frame_shift_ms,
- "Frame shift in milliseconds");
- opts->Register("two-freq-min-energy", &two_freq_min_energy,
- "For detecting two-frequency tones, minimum energy that "
- "the quieter frequency must have (relative to total "
- "enegy of frame)");
- opts->Register("two-freq-min-total-energy", &two_freq_min_total_energy,
- "For detecting two-frequency tones, minimum energy that "
- "the two frequencies together must have (relative to total "
- "energy of frame)");
- opts->Register("one-freq-min-energy", &one_freq_min_energy, "For detecting "
- "single-frequency tones, minimum energy that the frequency "
- "must have relative to total energy of frame");
- opts->Register("min-freq", &min_freq, "Minimum frequency of sinusoid that "
- "will be detected");
- opts->Register("max-freq", &max_freq, "Maximum frequency of sinusoid that "
- "will be detected");
- opts->Register("subsample-freq", &subsample_freq, "Frequency at which "
- "we subsample the signal");
- opts->Register("subsample-filter-cutoff", &subsample_filter_cutoff, "Filter "
- "cut-off frequency used in subsampling");
- }
- void Check() const {
- KALDI_ASSERT(frame_length_ms > 0 && frame_length_ms >= frame_shift_ms &&
- min_freq > 0 && max_freq > min_freq &&
- subsample_filter_cutoff > max_freq &&
- subsample_freq/2 > subsample_filter_cutoff &&
- subsample_filter_zeros > 2 &&
- subsample_filter_cutoff > 0.25 * subsample_freq &&
- two_freq_min_total_energy > two_freq_min_energy &&
- two_freq_min_energy <= 0.5 * two_freq_min_total_energy);
- BaseFloat samples_per_frame_shift =
- frame_shift_ms * 0.001 * subsample_freq;
- // The following assert ensures that the frame-shift is an exact
- // number of samples, so that the locations of the frames
- // don't gradually drift out of sync.
- KALDI_ASSERT(fabs(samples_per_frame_shift -
- static_cast<int32>(samples_per_frame_shift)) <
- 0.001);
-
- }
-};
-
-struct MultiSinusoidDetectorOutput {
- BaseFloat tot_energy; // Total energy per sample of this frame (sum-square of
- // signal divided by number of samples... this is after
- // downsampling and mean subtraction.
- BaseFloat freq1; // Lower frequency detected, or 0 if none detected.
- BaseFloat energy1; // Energy of lower frequency divided by total energy, or 0
- // if none detected.
- BaseFloat freq2; // Lower frequency detected, or 0 if zero or one
- // frequencies detected.
- BaseFloat energy2; // Energy of higher frequency divided by total energy, or 0
- // if zero or one freqencies detected.
- MultiSinusoidDetectorOutput(): tot_energy(0.0), freq1(0.0),
- energy1(0.0), freq2(0.0), energy2(0.0) { }
-};
-
-
-class MultiSinusoidDetector {
- public:
-
- // Initialize sinusoid detector. Sampling frequency must be integer.
- MultiSinusoidDetector(const MultiSinusoidDetectorConfig &config,
- int32 sampling_freq);
-
- /// This is how the class acccepts its input. You can put the waveform in
- /// piece by piece, if it's an online application.
- void AcceptWaveform(const VectorBase<BaseFloat> &waveform);
-
- /// The user calls this to announce to the class that the waveform has ended;
- /// this forces any pending data to be flushed.
- void WaveformFinished();
-
- /// Resets the state of the class so you can start processing another waveform.
- void Reset();
-
- /// This returns true if the class currently has no more data ready to output.
- bool Done() const;
-
- /// Outputs the next frame of output to "frame", which must be non-NULL.
- /// It is an error to call this if Done() has returned true, or has not been
- /// checked.
- void GetNextFrame(MultiSinusoidDetectorOutput *output);
-
- BaseFloat FrameShiftSecs() const { return 0.001 * config_.frame_shift_ms; }
-
- BaseFloat SamplingFrequency() const { return sample_freq_; }
-
- private:
- // Gets the next frame of subsampled signal, and consumes the appropriate
- // amount of stored data. It is an error to call this if Done() returned
- // true.
- void GetNextFrameOfSignal(Vector<BaseFloat> *frame);
-
- // returns true and sets freq1, freq1, energy1 and energy2 in "output" if we
- // successfully detected an acceptable two-frequency tone.
- bool DetectedTwoFrequency(BaseFloat signal_energy,
- const Sinusoid &sinusoid1,
- BaseFloat energy1,
- const Sinusoid &sinusoid2,
- BaseFloat energy2,
- MultiSinusoidDetectorOutput *output);
-
- // returns true and sets freq1, freq1, energy1 and energy2 in "output" if we
- // successfully detected an acceptable one-frequency tone.
- bool DetectedOneFrequency(BaseFloat signal_energy,
- const Sinusoid &sinusoid1,
- BaseFloat energy1,
- const Sinusoid &sinusoid2,
- BaseFloat energy2,
- MultiSinusoidDetectorOutput *output);
-
-
- // Returns std::min(max_samp, sum-of-samples-in-subsampled_signal_).
- // (the std::min is for efficiency so we don't have to visit the
- // whole list).
- int32 NumSubsampledSamplesReady(int32 max_samp) const;
-
- MultiSinusoidDetectorConfig config_;
- int32 sample_freq_;
- int32 samples_per_frame_subsampled_; // (samples per frame at subsampled
- // rate).
-
- // True if the user has called WaveformFinished().
- bool waveform_finished_;
-
- // Pieces of the subsampled signal that are awaiting processing.
- // Normally there will be just one element here, but if someone calls
- // AcceptWaveform multiple times before getting output, there could
- // be more elements. All of these pieces are nonempty.
- std::deque<Vector<BaseFloat>* > subsampled_signal_;
-
- // stores the number of samples consumed from the first member of
- // subsampled_signal_. We will always have samples_consumed_ >= 0 and either
- // (subsampled_signal_.empty() && samples_consumed_ == 0) or
- // samples_consumed_ < subsampled_signal_[0]->Dim().
- int32 samples_consumed_;
-
-
- // This object is used to subsample the signal.
- LinearResample resampler_;
-
- // This object is used to detect sinusoids in the subsampled
- // frames.
- SinusoidDetector detector_;
-};
-
-// Detect sinusoids. Signal should be sampled at detector->SamplingFrequency().
-void DetectSinusoids(const VectorBase<BaseFloat> &signal,
- MultiSinusoidDetector *detector,
- Matrix<BaseFloat> *output);
-
-
-
-
-
-/// @} End of "addtogroup feat"
-} // namespace kaldi
-#endif // KALDI_FEAT_SINUSOID_DETECTION_H_
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index dc2bea215d8a7cd5f352f2af5a0642f7c33b709c..c51867b7d4c7f50b23253c457c3e758b7690f964 100644 (file)
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
apply-cmvn-sliding compute-cmvn-stats-two-channel compute-kaldi-pitch-feats \
process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \
compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \
- wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \
- concat-feats append-post-to-feats post-to-feats
+ wav-reverberate append-vector-to-feats shift-feats concat-feats \
+ append-post-to-feats post-to-feats
OBJFILES =
diff --git a/src/featbin/detect-sinusoids.cc b/src/featbin/detect-sinusoids.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-// featbin/detect-sinusoids.cc
-
-// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "feat/sinusoid-detection.h"
-#include "feat/wave-reader.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- const char *usage =
- "Detect sinusoids (one or two at a time) in waveform input and output\n"
- "frame-by-frame information on their frequencies and energies. Useful\n"
- "as part of DTMF and dialtone detection. Output is an archive of\n"
- "matrices; for each file, there is a row per frame, containing\n"
- "<signal-energy-per-sample> <frequency1> <energy1> <frequency2> <energy2>\n"
- "where the frequencies and energies may be zero if no sufficiently\n"
- "dominant sinusoid(s) was/were detected. If two frequencies were\n"
- "detected, frequency1 < frequency2. See options for more detail on\n"
- "configuration options.\n"
- "\n"
- "Usage: detect-sinusoids [options] <wav-rspecifier> <matrix-wspecifier>\n"
- "e.g.: detect-sinusoids scp:wav.scp ark,t:sinusoids.ark\n";
-
- ParseOptions po(usage);
- MultiSinusoidDetectorConfig config;
-
- config.Register(&po);
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 2) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string wav_rspecifier = po.GetArg(1),
- matrix_wspecifier = po.GetArg(2);
-
- int32 num_done = 0, num_err = 0;
-
- SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
- BaseFloatMatrixWriter matrix_writer(matrix_wspecifier);
-
- MultiSinusoidDetector *detector = NULL;
-
- for (; !wav_reader.Done(); wav_reader.Next()) {
- const WaveData &wav_data = wav_reader.Value();
- const Matrix<BaseFloat> &data = wav_data.Data();
- BaseFloat samp_freq = wav_data.SampFreq();
- int32 num_channels = data.NumRows();
- if (num_channels != 1) {
- KALDI_WARN << "detect-sinusoids requires data with one "
- << "channel. Recording " << wav_reader.Key() << " has "
- << num_channels << ". First select one channel of your "
- << "data (e.g. using sox)";
- num_err++;
- continue;
- }
- if (samp_freq < config.subsample_freq) {
- KALDI_WARN << "Sampling frequency of data " << wav_reader.Key()
- << " is too low " << samp_freq << " < "
- << config.subsample_freq << ". Reduce --subsample-freq "
- << "if you want to run on this data.";
- num_err++;
- continue;
- }
-
- if (detector == NULL ||
- samp_freq != detector->SamplingFrequency()) {
- delete detector;
- detector = new MultiSinusoidDetector(config, samp_freq);
- }
-
- Matrix<BaseFloat> output;
- DetectSinusoids(data.Row(0), detector, &output);
-
- if (output.NumRows() == 0) {
- KALDI_WARN << "No output for " << wav_reader.Key();
- num_err++;
- } else {
- matrix_writer.Write(wav_reader.Key(), output);
- num_done++;
- }
- }
- delete detector;
- KALDI_LOG << "Detected sinusoids in " << num_done << " wave files,"
- << num_err << " with errors.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
diff --git a/src/sgmm/Makefile b/src/sgmm/Makefile
--- a/src/sgmm/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-all:
-
-OPENFST_CXXFLAGS =
-OPENFST_LDLIBS =
-include ../kaldi.mk
-
-TESTFILES = am-sgmm-test estimate-am-sgmm-test fmllr-sgmm-test \
- estimate-am-sgmm-multi-test
-
-OBJFILES = am-sgmm.o estimate-am-sgmm.o fmllr-sgmm.o sgmm-clusterable.o \
- estimate-am-sgmm-ebw.o estimate-am-sgmm-multi.o decodable-am-sgmm.o
-
-LIBNAME = kaldi-sgmm
-ADDLIBS = ../hmm/kaldi-hmm.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
- ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \
- ../matrix/kaldi-matrix.a ../base/kaldi-base.a
-
-include ../makefiles/default_rules.mk
diff --git a/src/sgmm/am-sgmm-test.cc b/src/sgmm/am-sgmm-test.cc
--- a/src/sgmm/am-sgmm-test.cc
+++ /dev/null
@@ -1,278 +0,0 @@
-// sgmm/am-sgmm-test.cc
-
-// Copyright 2012 Arnab Ghoshal
-// Copyright 2009-2011 Saarland University
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmm/model-test-common.h"
-#include "sgmm/am-sgmm.h"
-#include "util/kaldi-io.h"
-
-using kaldi::AmSgmm;
-using kaldi::int32;
-using kaldi::BaseFloat;
-namespace ut = kaldi::unittest;
-
-// Tests the initialization routines: InitializeFromFullGmm(), CopyFromSgmm()
-// and CopyGlobalsInitVecs().
-void TestSgmmInit(const AmSgmm &sgmm) {
- using namespace kaldi;
- int32 dim = sgmm.FeatureDim();
- kaldi::SgmmGselectConfig config;
- config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
-
- kaldi::Vector<BaseFloat> feat(dim);
- for (int32 d = 0; d < dim; d++) {
- feat(d) = kaldi::RandGauss();
- }
- kaldi::SgmmPerFrameDerivedVars frame_vars;
- frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(),
- sgmm.PhoneSpaceDim());
-
- std::vector<int32> gselect;
- sgmm.GaussianSelection(config, feat, &gselect);
- SgmmPerSpkDerivedVars empty;
- SgmmPerFrameDerivedVars per_frame;
- sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0);
-
- // First, test the CopyFromSgmm() method:
- AmSgmm *sgmm1 = new AmSgmm();
- sgmm1->CopyFromSgmm(sgmm, true);
- sgmm1->GaussianSelection(config, feat, &gselect);
- sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike1, 1e-4);
- delete sgmm1;
-
- AmSgmm *sgmm2 = new AmSgmm();
- sgmm2->CopyFromSgmm(sgmm, false);
- sgmm2->ComputeNormalizers();
- sgmm2->GaussianSelection(config, feat, &gselect);
- sgmm2->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike2 = sgmm2->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike2, 1e-4);
- delete sgmm2;
-
- // Next, initialize using the UBM from the current model
- AmSgmm *sgmm3 = new AmSgmm();
- sgmm3->InitializeFromFullGmm(sgmm.full_ubm(), sgmm.NumPdfs(),
- sgmm.PhoneSpaceDim(), sgmm.SpkSpaceDim());
- sgmm3->ComputeNormalizers();
- sgmm3->GaussianSelection(config, feat, &gselect);
- sgmm3->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike3 = sgmm3->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike3, 1e-4);
- delete sgmm3;
-
- // Finally, copy the global parameters from the current model
- AmSgmm *sgmm4 = new AmSgmm();
- sgmm4->CopyGlobalsInitVecs(sgmm, sgmm.PhoneSpaceDim(), sgmm.SpkSpaceDim(),
- sgmm.NumPdfs());
- sgmm4->ComputeNormalizers();
- sgmm4->GaussianSelection(config, feat, &gselect);
- sgmm4->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike4 = sgmm4->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike4, 1e-4);
- delete sgmm4;
-}
-
-// Tests the Read() and Write() methods, in both binary and ASCII mode, as well
-// as Check(), and methods in likelihood computations.
-void TestSgmmIO(const AmSgmm &sgmm) {
- using namespace kaldi;
- int32 dim = sgmm.FeatureDim();
- kaldi::SgmmGselectConfig config;
- config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
-
- kaldi::Vector<BaseFloat> feat(dim);
- for (int32 d = 0; d < dim; d++) {
- feat(d) = kaldi::RandGauss();
- }
- kaldi::SgmmPerFrameDerivedVars frame_vars;
- frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(),
- sgmm.PhoneSpaceDim());
-
- std::vector<int32> gselect;
- sgmm.GaussianSelection(config, feat, &gselect);
- SgmmPerSpkDerivedVars empty;
- SgmmPerFrameDerivedVars per_frame;
- sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0);
-
- // First, non-binary write
- sgmm.Write(kaldi::Output("tmpf", false).Stream(), false,
- kaldi::kSgmmWriteAll);
-
- bool binary_in;
- AmSgmm *sgmm1 = new AmSgmm();
- // Non-binary read
- kaldi::Input ki1("tmpf", &binary_in);
- sgmm1->Read(ki1.Stream(), binary_in);
- sgmm1->Check(true);
- sgmm1->GaussianSelection(config, feat, &gselect);
- sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
-
- BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike1, 1e-4);
-
- // Next, binary write
- sgmm1->Write(kaldi::Output("tmpfb", true).Stream(), true,
- kaldi::kSgmmWriteAll);
- delete sgmm1;
-
- AmSgmm *sgmm2 = new AmSgmm();
- // Binary read
- kaldi::Input ki2("tmpfb", &binary_in);
- sgmm2->Read(ki2.Stream(), binary_in);
- sgmm2->Check(true);
- sgmm2->GaussianSelection(config, feat, &gselect);
- sgmm2->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike2 = sgmm2->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike2, 1e-4);
- delete sgmm2;
-
- unlink("tmpf");
- unlink("tmpfb");
-}
-
-void TestSgmmSubstates(const AmSgmm &sgmm) {
- using namespace kaldi;
- int32 target_substates = 2 * sgmm.NumPdfs();
- kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
- for (int32 i = 0; i < occs.Dim(); i++)
- occs(i) = std::fabs(kaldi::RandGauss()) * (kaldi::RandUniform()+1);
- AmSgmm *sgmm1 = new AmSgmm();
- sgmm1->CopyFromSgmm(sgmm, false);
- sgmm1->SplitSubstates(occs, target_substates, 0.01, 0.2, 1000);
- sgmm1->ComputeNormalizers();
- sgmm1->Check(true);
- int32 dim = sgmm.FeatureDim();
- kaldi::SgmmGselectConfig config;
- config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
- kaldi::Vector<BaseFloat> feat(dim);
- for (int32 d = 0; d < dim; d++) {
- feat(d) = kaldi::RandGauss();
- }
-
- std::vector<int32> gselect;
- sgmm.GaussianSelection(config, feat, &gselect);
-
- SgmmPerSpkDerivedVars empty;
- SgmmPerFrameDerivedVars per_frame;
- sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0);
-
- sgmm1->GaussianSelection(config, feat, &gselect);
- sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike1, 1e-2);
-
- delete sgmm1;
-}
-
-void TestSgmmIncreaseDim(const AmSgmm &sgmm) {
- using namespace kaldi;
- int32 target_phn_dim = static_cast<int32>(1.5 * sgmm.PhoneSpaceDim());
- int32 target_spk_dim = sgmm.PhoneSpaceDim() - 1;
-
- int32 dim = sgmm.FeatureDim();
- kaldi::SgmmGselectConfig config;
- config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss());
- kaldi::Vector<BaseFloat> feat(dim);
- for (int32 d = 0; d < dim; d++) {
- feat(d) = kaldi::RandGauss();
- }
- kaldi::SgmmPerFrameDerivedVars frame_vars;
-
- std::vector<int32> gselect;
- sgmm.GaussianSelection(config, feat, &gselect);
- SgmmPerSpkDerivedVars empty;
- SgmmPerFrameDerivedVars per_frame;
- sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0);
-
- kaldi::Matrix<BaseFloat> norm_xform;
- kaldi::ComputeFeatureNormalizer(sgmm.full_ubm(), &norm_xform);
- AmSgmm *sgmm1 = new AmSgmm();
- sgmm1->CopyFromSgmm(sgmm, false);
- sgmm1->Check(true);
- sgmm1->IncreasePhoneSpaceDim(target_phn_dim, norm_xform);
- sgmm1->ComputeNormalizers();
- sgmm1->Check(true);
-
-
- sgmm1->GaussianSelection(config, feat, &gselect);
- sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike1, 1e-4);
-
- sgmm1->IncreaseSpkSpaceDim(target_spk_dim, norm_xform);
- sgmm1->Check(true);
- sgmm1->GaussianSelection(config, feat, &gselect);
- sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame);
- BaseFloat loglike2 = sgmm1->LogLikelihood(per_frame, 0);
- kaldi::AssertEqual(loglike, loglike2, 1e-4);
- delete sgmm1;
-}
-
-void TestSgmmPreXform(const AmSgmm &sgmm) {
- kaldi::Matrix<BaseFloat> xform, inv_xform;
- kaldi::Vector<BaseFloat> diag_scatter;
- kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
- occs.Set(100);
- sgmm.ComputeFmllrPreXform(occs, &xform, &inv_xform, &diag_scatter);
- int32 dim = xform.NumRows();
- kaldi::SubMatrix<BaseFloat> a_pre(xform, 0, dim, 0, dim),
- a_inv(inv_xform, 0, dim, 0, dim);
- kaldi::Vector<BaseFloat> b_pre(dim), b_inv(dim);
- b_pre.CopyColFromMat(xform, dim);
- b_inv.CopyColFromMat(inv_xform, dim);
- kaldi::Matrix<BaseFloat> res_mat(dim, dim, kaldi::kSetZero);
- res_mat.AddMatMat(1.0, a_pre, kaldi::kNoTrans, a_inv, kaldi::kNoTrans, 0.0);
- KALDI_ASSERT(res_mat.IsUnit(1.0e-6));
- kaldi::Vector<BaseFloat> res_vec(dim, kaldi::kSetZero);
- res_vec.AddMatVec(1.0, a_inv, kaldi::kNoTrans, b_pre, 0.0);
- res_vec.AddVec(1.0, b_inv);
- KALDI_ASSERT(res_vec.IsZero(1.0e-6));
-}
-
-void UnitTestSgmm() {
- size_t dim = 1 + kaldi::RandInt(0, 9); // random dimension of the gmm
- size_t num_comp = 3 + kaldi::RandInt(0, 9); // random number of mixtures;
- // make sure it's more than one or we get errors initializing the SGMM.
- kaldi::FullGmm full_gmm;
- ut::InitRandFullGmm(dim, num_comp, &full_gmm);
-
- size_t num_states = 1;
- AmSgmm sgmm;
- kaldi::SgmmGselectConfig config;
- sgmm.InitializeFromFullGmm(full_gmm, num_states, dim+1, 0);
- sgmm.ComputeNormalizers();
- TestSgmmInit(sgmm);
- TestSgmmIO(sgmm);
- TestSgmmSubstates(sgmm);
- TestSgmmIncreaseDim(sgmm);
- TestSgmmPreXform(sgmm);
-}
-
-int main() {
- for (int i = 0; i < 10; i++)
- UnitTestSgmm();
- std::cout << "Test OK.\n";
- return 0;
-}
diff --git a/src/sgmm/am-sgmm.cc b/src/sgmm/am-sgmm.cc
--- a/src/sgmm/am-sgmm.cc
+++ /dev/null
@@ -1,1395 +0,0 @@
-// sgmm/am-sgmm.cc
-
-// Copyright 2009-2011 Microsoft Corporation; Lukas Burget;
-// Saarland University (Author: Arnab Ghoshal);
-// Ondrej Glembek; Yanmin Qian;
-// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
-// Liang Lu; Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <functional>
-#include <queue>
-#include "sgmm/am-sgmm.h"
-#include "thread/kaldi-thread.h"
-
-namespace kaldi {
-using std::vector;
-
-// This function needs to be added because std::generate is complaining
-// about RandGauss(), which takes an optional arguments.
-static inline float _RandGauss()
-{
- return RandGauss();
-}
-
// Reads the model from a stream.  The on-disk format is a header
// (<SGMM> <NUMSTATES> ... <DIMENSION> ...) followed by a sequence of
// tagged sections terminated by </SGMM>; the sections may appear in any
// order, which is what the token-dispatch loop below implements.
// If no normalizers (<n>) were stored, they are recomputed at the end.
void AmSgmm::Read(std::istream &in_stream, bool binary) {
  int32 num_states, feat_dim, num_gauss;
  std::string token;

  ExpectToken(in_stream, binary, "<SGMM>");
  ExpectToken(in_stream, binary, "<NUMSTATES>");
  ReadBasicType(in_stream, binary, &num_states);
  ExpectToken(in_stream, binary, "<DIMENSION>");
  ReadBasicType(in_stream, binary, &feat_dim);
  KALDI_ASSERT(num_states > 0 && feat_dim > 0);

  ReadToken(in_stream, binary, &token);

  while (token != "</SGMM>") {
    if (token == "<DIAG_UBM>") {
      diag_ubm_.Read(in_stream, binary);
    } else if (token == "<FULL_UBM>") {
      full_ubm_.Read(in_stream, binary);
    } else if (token == "<SigmaInv>") {  // globally-shared inverse covariances
      ExpectToken(in_stream, binary, "<NUMGaussians>");
      ReadBasicType(in_stream, binary, &num_gauss);
      SigmaInv_.resize(num_gauss);
      for (int32 i = 0; i < num_gauss; i++) {
        SigmaInv_[i].Read(in_stream, binary);
      }
    } else if (token == "<M>") {  // phone-subspace projections, one per Gaussian
      ExpectToken(in_stream, binary, "<NUMGaussians>");
      ReadBasicType(in_stream, binary, &num_gauss);
      M_.resize(num_gauss);
      for (int32 i = 0; i < num_gauss; i++) {
        M_[i].Read(in_stream, binary);
      }
    } else if (token == "<N>") {  // speaker-subspace projections (optional)
      ExpectToken(in_stream, binary, "<NUMGaussians>");
      ReadBasicType(in_stream, binary, &num_gauss);
      N_.resize(num_gauss);
      for (int32 i = 0; i < num_gauss; i++) {
        N_[i].Read(in_stream, binary);
      }
    } else if (token == "<w>") {  // weight projections
      w_.Read(in_stream, binary);
    } else if (token == "<v>") {  // per-state substate vectors
      v_.resize(num_states);
      for (int32 j = 0; j < num_states; j++) {
        v_[j].Read(in_stream, binary);
      }
    } else if (token == "<c>") {  // per-state substate weights
      c_.resize(num_states);
      for (int32 j = 0; j < num_states; j++) {
        c_[j].Read(in_stream, binary);
      }
    } else if (token == "<n>") {  // per-state normalizers (may be absent)
      n_.resize(num_states);
      for (int32 j = 0; j < num_states; j++) {
        n_[j].Read(in_stream, binary);
      }
      // The following are the Gaussian prior parameters for MAP adaptation of M
      // They may be moved to somewhere else eventually.
    } else if (token == "<M_Prior>") {
      ExpectToken(in_stream, binary, "<NUMGaussians>");
      ReadBasicType(in_stream, binary, &num_gauss);
      M_prior_.resize(num_gauss);
      for (int32 i = 0; i < num_gauss; i++) {
        M_prior_[i].Read(in_stream, binary);
      }
    } else if (token == "<Row_Cov_Inv>") {
      row_cov_inv_.Read(in_stream, binary);
    } else if (token == "<Col_Cov_Inv>") {
      col_cov_inv_.Read(in_stream, binary);
    } else {
      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
    }
    ReadToken(in_stream, binary, &token);
  }

  // Normalizers are derived quantities; if the file did not store them,
  // compute them now so the model is immediately usable.
  if (n_.empty()) {
    ComputeNormalizers();
  }
}
-
// Writes the model to a stream.  write_params is a bit-mask selecting which
// groups of parameters to emit (background GMMs, global parameters,
// state-specific parameters, normalizers); the token layout here must stay
// in sync with what Read() accepts.
void AmSgmm::Write(std::ostream &out_stream, bool binary,
                   SgmmWriteFlagsType write_params) const {
  int32 num_states = NumPdfs(),
      feat_dim = FeatureDim(),
      num_gauss = NumGauss();

  WriteToken(out_stream, binary, "<SGMM>");
  if (!binary) out_stream << "\n";  // newlines only for text-mode readability
  WriteToken(out_stream, binary, "<NUMSTATES>");
  WriteBasicType(out_stream, binary, num_states);
  WriteToken(out_stream, binary, "<DIMENSION>");
  WriteBasicType(out_stream, binary, feat_dim);
  if (!binary) out_stream << "\n";

  if (write_params & kSgmmBackgroundGmms) {
    WriteToken(out_stream, binary, "<DIAG_UBM>");
    diag_ubm_.Write(out_stream, binary);
    WriteToken(out_stream, binary, "<FULL_UBM>");
    full_ubm_.Write(out_stream, binary);
  }

  if (write_params & kSgmmGlobalParams) {
    WriteToken(out_stream, binary, "<SigmaInv>");
    WriteToken(out_stream, binary, "<NUMGaussians>");
    WriteBasicType(out_stream, binary, num_gauss);
    if (!binary) out_stream << "\n";
    for (int32 i = 0; i < num_gauss; i++) {
      SigmaInv_[i].Write(out_stream, binary);
    }
    WriteToken(out_stream, binary, "<M>");
    WriteToken(out_stream, binary, "<NUMGaussians>");
    WriteBasicType(out_stream, binary, num_gauss);
    if (!binary) out_stream << "\n";
    for (int32 i = 0; i < num_gauss; i++) {
      M_[i].Write(out_stream, binary);
    }
    // Speaker projections are optional: only written if present.
    if (N_.size() != 0) {
      WriteToken(out_stream, binary, "<N>");
      WriteToken(out_stream, binary, "<NUMGaussians>");
      WriteBasicType(out_stream, binary, num_gauss);
      if (!binary) out_stream << "\n";
      for (int32 i = 0; i < num_gauss; i++) {
        N_[i].Write(out_stream, binary);
      }
    }
    WriteToken(out_stream, binary, "<w>");
    w_.Write(out_stream, binary);

    // The following are the Gaussian prior parameters for MAP adaptation of M.
    // They may be moved to somewhere else eventually.
    if (M_prior_.size() != 0) {
      WriteToken(out_stream, binary, "<M_Prior>");
      WriteToken(out_stream, binary, "<NUMGaussians>");
      WriteBasicType(out_stream, binary, num_gauss);
      if (!binary) out_stream << "\n";
      for (int32 i = 0; i < num_gauss; i++) {
        M_prior_[i].Write(out_stream, binary);
      }

      // If a prior over M is present, both covariance factors must be too.
      KALDI_ASSERT(row_cov_inv_.NumRows() != 0 &&
                   "Empty row covariance for MAP prior");
      WriteToken(out_stream, binary, "<Row_Cov_Inv>");
      if (!binary) out_stream << "\n";
      row_cov_inv_.Write(out_stream, binary);

      KALDI_ASSERT(col_cov_inv_.NumRows() != 0 &&
                   "Empty column covariance for MAP prior");
      WriteToken(out_stream, binary, "<Col_Cov_Inv>");
      if (!binary) out_stream << "\n";
      col_cov_inv_.Write(out_stream, binary);
    }
    // end priors for MAP adaptation
  }

  if (write_params & kSgmmStateParams) {
    WriteToken(out_stream, binary, "<v>");
    for (int32 j = 0; j < num_states; j++) {
      v_[j].Write(out_stream, binary);
    }
    WriteToken(out_stream, binary, "<c>");
    for (int32 j = 0; j < num_states; j++) {
      c_[j].Write(out_stream, binary);
    }
  }

  if (write_params & kSgmmNormalizers) {
    WriteToken(out_stream, binary, "<n>");
    if (n_.empty())
      KALDI_WARN << "Not writing normalizers since they are not present.";
    else
      for (int32 j = 0; j < num_states; j++)
        n_[j].Write(out_stream, binary);
  }

  WriteToken(out_stream, binary, "</SGMM>");
}
-
// Sanity-checks the mutual consistency of all parameter dimensions
// (UBMs, covariances, projections, substate vectors/weights, normalizers).
// Crashes via KALDI_ASSERT on any inconsistency; if show_properties is
// true, also logs a summary of the model's sizes.
void AmSgmm::Check(bool show_properties) {
  int32 num_states = NumPdfs(),
      num_gauss = NumGauss(),
      feat_dim = FeatureDim(),
      phn_dim = PhoneSpaceDim(),
      spk_dim = SpkSpaceDim();

  if (show_properties)
    KALDI_LOG << "AmSgmm: #states = " << num_states << ", #Gaussians = "
              << num_gauss << ", feature dim = " << feat_dim
              << ", phone-space dim =" << phn_dim
              << ", speaker-space dim =" << spk_dim;
  // spk_dim may legitimately be 0 (no speaker subspace), so it is not checked.
  KALDI_ASSERT(num_states > 0 && num_gauss > 0 && feat_dim > 0 && phn_dim > 0);

  std::ostringstream debug_str;

  // First check the diagonal-covariance UBM.
  KALDI_ASSERT(diag_ubm_.NumGauss() == num_gauss);
  KALDI_ASSERT(diag_ubm_.Dim() == feat_dim);

  // Check the full-covariance UBM.
  KALDI_ASSERT(full_ubm_.NumGauss() == num_gauss);
  KALDI_ASSERT(full_ubm_.Dim() == feat_dim);

  // Check the globally-shared covariance matrices.
  KALDI_ASSERT(SigmaInv_.size() == static_cast<size_t>(num_gauss));
  for (int32 i = 0; i < num_gauss; i++) {
    KALDI_ASSERT(SigmaInv_[i].NumRows() == feat_dim &&
                 SigmaInv_[i](0, 0) > 0.0);  // or it wouldn't be +ve definite.
  }

  // Phone-subspace projections: one feat_dim x phn_dim matrix per Gaussian.
  KALDI_ASSERT(M_.size() == static_cast<size_t>(num_gauss));
  for (int32 i = 0; i < num_gauss; i++) {
    KALDI_ASSERT(M_[i].NumRows() == feat_dim && M_[i].NumCols() == phn_dim);
  }

  KALDI_ASSERT(w_.NumRows() == num_gauss && w_.NumCols() == phn_dim);

  { // check v, c.
    KALDI_ASSERT(v_.size() == static_cast<size_t>(num_states) &&
                 c_.size() == static_cast<size_t>(num_states));
    int32 nSubstatesTot = 0;
    for (int32 j = 0; j < num_states; j++) {
      int32 M_j = NumSubstates(j);
      nSubstatesTot += M_j;
      KALDI_ASSERT(M_j > 0 && v_[j].NumRows() == M_j &&
                   c_[j].Dim() == M_j && v_[j].NumCols() == phn_dim);
    }
    debug_str << "Substates: "<< (nSubstatesTot) << ". ";
  }

  // check n. (Normalizers are derived quantities and may be absent.)
  if (n_.size() == 0) {
    debug_str << "Normalizers: no. ";
  } else {
    debug_str << "Normalizers: yes. ";
    KALDI_ASSERT(n_.size() == static_cast<size_t>(num_states));
    for (int32 j = 0; j < num_states; j++) {
      KALDI_ASSERT(n_[j].NumRows() == num_gauss &&
                   n_[j].NumCols() == NumSubstates(j));
    }
  }

  if (show_properties)
    KALDI_LOG << "Subspace GMM model properties: " << debug_str.str();
}
-
-void AmSgmm::InitializeFromFullGmm(const FullGmm &full_gmm,
- int32 num_states,
- int32 phn_subspace_dim,
- int32 spk_subspace_dim) {
- full_ubm_.CopyFromFullGmm(full_gmm);
- diag_ubm_.CopyFromFullGmm(full_gmm);
- if (phn_subspace_dim < 1 || phn_subspace_dim > full_gmm.Dim() + 1) {
- KALDI_WARN << "Initial phone-subspace dimension must be in [1, "
- << full_gmm.Dim() + 1 << "]. Changing from " << phn_subspace_dim
- << " to " << full_gmm.Dim() + 1;
- phn_subspace_dim = full_gmm.Dim() + 1;
- }
- if (spk_subspace_dim < 0 || spk_subspace_dim > full_gmm.Dim()) {
- KALDI_WARN << "Initial spk-subspace dimension must be in [1, "
- << full_gmm.Dim() << "]. Changing from " << spk_subspace_dim
- << " to " << full_gmm.Dim();
- spk_subspace_dim = full_gmm.Dim();
- }
- w_.Resize(0, 0);
- N_.clear();
- c_.clear();
- v_.clear();
- SigmaInv_.clear();
-
- KALDI_LOG << "Initializing model";
- Matrix<BaseFloat> norm_xform;
- ComputeFeatureNormalizer(full_gmm, &norm_xform);
- InitializeMw(phn_subspace_dim, norm_xform);
- if (spk_subspace_dim > 0) InitializeN(spk_subspace_dim, norm_xform);
- InitializeVecs(num_states);
- KALDI_LOG << "Initializing variances";
- InitializeCovars();
-}
-
-void AmSgmm::CopyFromSgmm(const AmSgmm &other,
- bool copy_normalizers) {
- KALDI_LOG << "Copying AmSgmm";
-
- // Copy background GMMs
- diag_ubm_.CopyFromDiagGmm(other.diag_ubm_);
- full_ubm_.CopyFromFullGmm(other.full_ubm_);
-
- // Copy global params
- SigmaInv_ = other.SigmaInv_;
- M_ = other.M_;
- w_ = other.w_;
- N_ = other.N_;
-
- // Copy state-specific params, but only copy normalizers if requested.
- v_ = other.v_;
- c_ = other.c_;
- if (copy_normalizers) n_ = other.n_;
-
- KALDI_LOG << "Done.";
-}
-
-void AmSgmm::CopyGlobalsInitVecs(const AmSgmm &other,
- int32 phn_subspace_dim,
- int32 spk_subspace_dim,
- int32 num_pdfs) {
- if (phn_subspace_dim < 1 || phn_subspace_dim > other.PhoneSpaceDim()) {
- KALDI_WARN << "Initial phone-subspace dimension must be in [1, "
- << other.PhoneSpaceDim() << "]. Changing from " << phn_subspace_dim
- << " to " << other.PhoneSpaceDim();
- phn_subspace_dim = other.PhoneSpaceDim();
- }
- if (spk_subspace_dim < 0 || spk_subspace_dim > other.SpkSpaceDim()) {
- KALDI_WARN << "Initial spk-subspace dimension must be in [1, "
- << other.SpkSpaceDim() << "]. Changing from " << spk_subspace_dim
- << " to " << other.SpkSpaceDim();
- spk_subspace_dim = other.SpkSpaceDim();
- }
-
- KALDI_LOG << "Initializing model";
-
- // Copy background GMMs
- diag_ubm_.CopyFromDiagGmm(other.diag_ubm_);
- full_ubm_.CopyFromFullGmm(other.full_ubm_);
-
- // Copy global params
- SigmaInv_ = other.SigmaInv_;
- int32 num_gauss = diag_ubm_.NumGauss(),
- data_dim = other.FeatureDim();
- M_.resize(num_gauss);
- w_.Resize(num_gauss, phn_subspace_dim);
- for (int32 i = 0; i < num_gauss; i++) {
- M_[i].Resize(data_dim, phn_subspace_dim);
- M_[i].CopyFromMat(other.M_[i].Range(0, data_dim, 0, phn_subspace_dim),
- kNoTrans);
- }
- w_.CopyFromMat(other.w_.Range(0, num_gauss, 0, phn_subspace_dim), kNoTrans);
-
- if (spk_subspace_dim > 0) {
- N_.resize(num_gauss);
- for (int32 i = 0; i < num_gauss; i++) {
- N_[i].Resize(data_dim, spk_subspace_dim);
- N_[i].CopyFromMat(other.N_[i].Range(0, data_dim, 0, spk_subspace_dim),
- kNoTrans);
- }
- } else {
- N_.clear();
- }
- InitializeVecs(num_pdfs);
-}
-
-
// Computes the per-frame derived quantities for the Gaussians selected in
// `gselect`: the (optionally speaker-offset-subtracted) features x_i(t),
// the projected vectors z_i(t) (Eq. 35), and the scalars n_i(t) (Eq. 36,
// plus logdet_s from any speaker-space feature transform).
// Requires the normalizers n_ to have been computed already.
void AmSgmm::ComputePerFrameVars(const VectorBase<BaseFloat> &data,
                                 const std::vector<int32> &gselect,
                                 const SgmmPerSpkDerivedVars &spk_vars,
                                 BaseFloat logdet_s,
                                 SgmmPerFrameDerivedVars *per_frame_vars) const {
  KALDI_ASSERT(!n_.empty() && "ComputeNormalizers() must be called.");

  // Resize the output struct only when the shapes actually changed,
  // to avoid reallocating on every frame.
  if (per_frame_vars->NeedsResizing(gselect.size(),
                                    FeatureDim(),
                                    PhoneSpaceDim()))
    per_frame_vars->Resize(gselect.size(), FeatureDim(), PhoneSpaceDim());

  per_frame_vars->gselect = gselect;
  per_frame_vars->xt.CopyFromVec(data);

  // x_i(t): subtract the per-Gaussian speaker offset o_s(i) if a speaker
  // vector is present; otherwise x_i(t) == x(t).
  for (int32 ki = 0, last = gselect.size(); ki < last; ki++) {
    int32 i = gselect[ki];
    per_frame_vars->xti.Row(ki).CopyFromVec(per_frame_vars->xt);
    if (spk_vars.v_s.Dim() != 0)
      per_frame_vars->xti.Row(ki).AddVec(-1.0, spk_vars.o_s.Row(i));
  }
  Vector<BaseFloat> SigmaInv_xt(FeatureDim());
  for (int32 ki = 0, last = gselect.size(); ki < last; ki++) {
    int32 i = gselect[ki];
    SigmaInv_xt.AddSpVec(1.0, SigmaInv_[i], per_frame_vars->xti.Row(ki), 0.0);
    // Eq (35): z_{i}(t) = M_{i}^{T} \Sigma_{i}^{-1} x_{i}(t)
    per_frame_vars->zti.Row(ki).AddMatVec(1.0, M_[i], kTrans, SigmaInv_xt, 0.0);
    // Eq.(36): n_{i}(t) = -0.5 x_{i}^{T} \Sigma_{i}^{-1} x_{i}(t)
    per_frame_vars->nti(ki) = -0.5 * VecVec(per_frame_vars->xti.Row(ki),
                                            SigmaInv_xt) + logdet_s;
  }
}
-
// Returns log p(x(t) | j): the log-likelihood of pdf j for the frame whose
// derived quantities are in per_frame_vars, summed over the selected
// Gaussians and substates.  log_prune is forwarded to LogSumExp for
// pruning of terms far below the maximum.
BaseFloat AmSgmm::LogLikelihood(const SgmmPerFrameDerivedVars &per_frame_vars,
                                int32 j, BaseFloat log_prune) const {
  KALDI_ASSERT(j < NumPdfs());
  const vector<int32> &gselect = per_frame_vars.gselect;


  // Eq.(37): log p(x(t), m, i|j) [indexed by j, ki]
  // Although the extra memory allocation of storing this as a
  // matrix might seem unnecessary, we save time in the LogSumExp()
  // via more effective pruning.
  Matrix<BaseFloat> logp_x(gselect.size(), NumSubstates(j));

  for (int32 ki = 0, last = gselect.size(); ki < last; ki++) {
    SubVector<BaseFloat> logp_xi(logp_x, ki);
    int32 i = gselect[ki];
    // for all substates, compute z_{i}^T v_{jm}
    logp_xi.AddMatVec(1.0, v_[j], kNoTrans, per_frame_vars.zti.Row(ki), 0.0);
    logp_xi.AddVec(1.0, n_[j].Row(i));  // for all substates, add n_{jim}
    logp_xi.Add(per_frame_vars.nti(ki));  // for all substates, add n_{i}(t)
  }
  // Eq. (38): log p(x(t)|j) = log \sum_{m, i} p(x(t), m, i|j)
  return logp_x.LogSumExp(log_prune);
}
-
// Computes the posterior probabilities of the (Gaussian, substate) pairs of
// pdf j for the current frame, writing them into *post (rows indexed by
// position in gselect, columns by substate).  Returns the total
// log-likelihood log p(x(t)|j) (the value ApplySoftMax normalized by).
BaseFloat
AmSgmm::ComponentPosteriors(const SgmmPerFrameDerivedVars &per_frame_vars,
                            int32 j,
                            Matrix<BaseFloat> *post) const {
  KALDI_ASSERT(j < NumPdfs());
  if (post == NULL) KALDI_ERR << "NULL pointer passed as return argument.";
  const vector<int32> &gselect = per_frame_vars.gselect;
  int32 num_gselect = gselect.size();
  post->Resize(num_gselect, NumSubstates(j));

  // Eq.(37): log p(x(t), m, i|j) = z_{i}^T v_{jm} (for all substates)
  post->AddMatMat(1.0, per_frame_vars.zti, kNoTrans, v_[j], kTrans, 0.0);
  for (int32 ki = 0; ki < num_gselect; ki++) {
    int32 i = gselect[ki];
    // Eq. (37): log p(x(t), m, i|j) += n_{jim} + n_{i}(t) (for all substates)
    post->Row(ki).AddVec(1.0, n_[j].Row(i));
    post->Row(ki).Add(per_frame_vars.nti(ki));
  }

  // Eq. (38): log p(x(t)|j) = log \sum_{m, i} p(x(t), m, i|j)
  return post->ApplySoftMax();
}
-
// Helper for SplitSubstates(): a (state, #substates, occupancy) record kept
// in a priority queue.  Ordering is by occupancy per substate, so the
// queue's top element is the state whose substates each carry the most
// data -- i.e. the best candidate to receive the next new substate.
struct SubstateCounter {
  SubstateCounter(int32 j, int32 num_substates, BaseFloat occ)
      : state_index(j), num_substates(num_substates), occupancy(occ) {}

  int32 state_index;    // pdf (state) index j
  int32 num_substates;  // number of substates currently assigned to state j
  BaseFloat occupancy;  // (possibly powered) occupation count of state j

  bool operator < (const SubstateCounter &r) const {
    return occupancy/num_substates < r.occupancy/r.num_substates;
  }
};
-
// Increases the total number of substates to target_nsubstates, assigning
// new substates greedily to the states with the highest (powered) occupancy
// per substate.  Each split halves the weight of the chosen substate and
// perturbs the two copies of its vector in opposite directions along a
// random direction scaled by `perturb` and the smoothed H matrix.
// Invalidates (clears) the normalizers n_.
void AmSgmm::SplitSubstates(const Vector<BaseFloat> &state_occupancies,
                            int32 target_nsubstates, BaseFloat perturb,
                            BaseFloat power, BaseFloat max_cond) {
  // power == p in document. target_nsubstates == T in document.
  KALDI_ASSERT(state_occupancies.Dim() == NumPdfs());
  int32 tot_n_substates_old = 0;
  int32 phn_dim = PhoneSpaceDim();
  std::priority_queue<SubstateCounter> substate_counts;
  vector< SpMatrix<BaseFloat> > H_i;
  SpMatrix<BaseFloat> sqrt_H_sm;
  Vector<BaseFloat> rand_vec(phn_dim), v_shift(phn_dim);

  for (int32 j = 0; j < NumPdfs(); j++) {
    BaseFloat gamma_p = pow(state_occupancies(j), power);
    substate_counts.push(SubstateCounter(j, NumSubstates(j), gamma_p));
    tot_n_substates_old += NumSubstates(j);
  }
  // Nothing to do if we already have at least the requested number.
  if (target_nsubstates <= tot_n_substates_old || tot_n_substates_old == 0) {
    KALDI_WARN << "Cannot split from " << (tot_n_substates_old) <<
        " to " << (target_nsubstates) << " substates.";
    return;
  }

  ComputeH(&H_i);  // set up that array.
  ComputeSmoothingTermsFromModel(H_i, state_occupancies, &sqrt_H_sm, max_cond);
  H_i.clear();
  sqrt_H_sm.ApplyPow(-0.5);  // sqrt_H_sm := H_sm^{-0.5}

  // Greedy allocation: repeatedly give one more substate to the state with
  // the largest occupancy-per-substate (the queue top).
  for (int32 n_states = tot_n_substates_old;
       n_states < target_nsubstates; n_states++) {
    SubstateCounter state_to_split = substate_counts.top();
    substate_counts.pop();
    state_to_split.num_substates++;
    substate_counts.push(state_to_split);
  }

  // Now grow each state's v_[j]/c_[j] to its target size and perform the
  // actual splits.
  while (!substate_counts.empty()) {
    int32 j = substate_counts.top().state_index;
    int32 tgt_n_substates_j = substate_counts.top().num_substates;
    int32 n_substates_j = NumSubstates(j);
    substate_counts.pop();

    if (n_substates_j == tgt_n_substates_j) continue;

    // Resize v[j] and c[j] to fit new substates
    Matrix<BaseFloat> tmp_v_j(v_[j]);
    v_[j].Resize(tgt_n_substates_j, phn_dim);
    v_[j].Range(0, n_substates_j, 0, phn_dim).CopyFromMat(tmp_v_j);
    tmp_v_j.Resize(0, 0);

    Vector<BaseFloat> tmp_c_j(c_[j]);
    c_[j].Resize(tgt_n_substates_j);
    c_[j].Range(0, n_substates_j).CopyFromVec(tmp_c_j);
    tmp_c_j.Resize(0);

    // Keep splitting substates until obtaining the desired number
    for (; n_substates_j < tgt_n_substates_j; n_substates_j++) {
      // Always split the substate with the largest weight.
      int32 split_substate = std::max_element(c_[j].Data(), c_[j].Data()
                                              + n_substates_j) - c_[j].Data();

      // c_{jkm} := c_{jmk}' := c_{jkm} / 2
      c_[j](split_substate) = c_[j](n_substates_j) = c_[j](split_substate) / 2;

      // v_{jkm} := +/- split_perturb * H_k^{(sm)}^{-0.5} * rand_vec
      std::generate(rand_vec.Data(), rand_vec.Data() + rand_vec.Dim(),
                    _RandGauss);
      v_shift.AddSpVec(perturb, sqrt_H_sm, rand_vec, 0.0);
      v_[j].Row(n_substates_j).CopyFromVec(v_[j].Row(split_substate));
      v_[j].Row(n_substates_j).AddVec(1.0, v_shift);
      v_[j].Row(split_substate).AddVec((-1.0), v_shift);
    }
  }
  KALDI_LOG << "Getting rid of normalizers as they will no longer be valid";

  n_.clear();
  KALDI_LOG << "Split " << (tot_n_substates_old) << " substates to "
            << (target_nsubstates);
}
-
-void AmSgmm::IncreasePhoneSpaceDim(int32 target_dim,
- const Matrix<BaseFloat> &norm_xform) {
- KALDI_ASSERT(!M_.empty());
- int32 initial_dim = PhoneSpaceDim(),
- feat_dim = FeatureDim();
- KALDI_ASSERT(norm_xform.NumRows() == feat_dim);
-
- if (target_dim < initial_dim)
- KALDI_ERR << "You asked to increase phn dim to a value lower than the "
- << " current dimension, " << target_dim << " < " << initial_dim;
-
- if (target_dim > initial_dim + feat_dim) {
- KALDI_WARN << "Cannot increase phone subspace dimensionality from "
- << initial_dim << " to " << target_dim << ", increasing to "
- << initial_dim + feat_dim;
- target_dim = initial_dim + feat_dim;
- }
-
- if (initial_dim < target_dim) {
- Matrix<BaseFloat> tmp_M(feat_dim, initial_dim);
- for (int32 i = 0; i < NumGauss(); i++) {
- tmp_M.CopyFromMat(M_[i]);
- M_[i].Resize(feat_dim, target_dim);
- M_[i].Range(0, feat_dim, 0, tmp_M.NumCols()).CopyFromMat(tmp_M);
- M_[i].Range(0, feat_dim, tmp_M.NumCols(),
- target_dim - tmp_M.NumCols()).CopyFromMat(norm_xform.Range(0,
- feat_dim, 0, target_dim-tmp_M.NumCols()));
- }
- Matrix<BaseFloat> tmp_w = w_;
- w_.Resize(tmp_w.NumRows(), target_dim);
- w_.Range(0, tmp_w.NumRows(), 0, tmp_w.NumCols()).CopyFromMat(tmp_w);
-
- for (int32 j = 0; j < NumPdfs(); j++) {
- // Resize v[j]
- Matrix<BaseFloat> tmp_v_j = v_[j];
- v_[j].Resize(tmp_v_j.NumRows(), target_dim);
- v_[j].Range(0, tmp_v_j.NumRows(), 0, tmp_v_j.NumCols()).CopyFromMat(
- tmp_v_j);
- }
- KALDI_LOG << "Phone subspace dimensionality increased from " <<
- initial_dim << " to " << target_dim;
- } else {
- KALDI_LOG << "Phone subspace dimensionality unchanged, since target " <<
- "dimension (" << target_dim << ") <= initial dimansion (" <<
- initial_dim << ")";
- }
-}
-
-void AmSgmm::IncreaseSpkSpaceDim(int32 target_dim,
- const Matrix<BaseFloat> &norm_xform) {
- int32 initial_dim = SpkSpaceDim(),
- feat_dim = FeatureDim();
- KALDI_ASSERT(norm_xform.NumRows() == feat_dim);
-
- if (N_.size() == 0)
- N_.resize(NumGauss());
-
- if (target_dim < initial_dim)
- KALDI_ERR << "You asked to increase spk dim to a value lower than the "
- << " current dimension, " << target_dim << " < " << initial_dim;
-
- if (target_dim > initial_dim + feat_dim) {
- KALDI_WARN << "Cannot increase speaker subspace dimensionality from "
- << initial_dim << " to " << target_dim << ", increasing to "
- << initial_dim + feat_dim;
- target_dim = initial_dim + feat_dim;
- }
-
- if (initial_dim < target_dim) {
- int32 dim_change = target_dim - initial_dim;
- Matrix<BaseFloat> tmp_N((initial_dim != 0) ? feat_dim : 0,
- initial_dim);
- for (int32 i = 0; i < NumGauss(); i++) {
- if (initial_dim != 0) tmp_N.CopyFromMat(N_[i]);
- N_[i].Resize(feat_dim, target_dim);
- if (initial_dim != 0) {
- N_[i].Range(0, feat_dim, 0, tmp_N.NumCols()).CopyFromMat(tmp_N);
- }
- N_[i].Range(0, feat_dim, tmp_N.NumCols(), dim_change).CopyFromMat(
- norm_xform.Range(0, feat_dim, 0, dim_change));
- }
- KALDI_LOG << "Speaker subspace dimensionality increased from " <<
- initial_dim << " to " << target_dim;
- } else {
- KALDI_LOG << "Speaker subspace dimensionality unchanged, since target " <<
- "dimension (" << target_dim << ") <= initial dimansion (" <<
- initial_dim << ")";
- }
-}
-
-void AmSgmm::ComputeDerivedVars() {
- if (n_.empty()) {
- ComputeNormalizers();
- }
- if (diag_ubm_.NumGauss() != full_ubm_.NumGauss()
- || diag_ubm_.Dim() != full_ubm_.Dim()) {
- diag_ubm_.CopyFromFullGmm(full_ubm_);
- }
-}
-
// Worker functor for multi-threaded normalizer computation.  Each thread
// gets its own copy, accumulates entropy diagnostics into its private
// members, and the destructor folds those into the shared totals pointed to
// by entropy_count_ptr_/entropy_sum_ptr_ (this accumulate-in-destructor
// pattern is how results are propagated back after the threads join).
// num_threads_ and thread_id_ are inherited from MultiThreadable.
class ComputeNormalizersClass: public MultiThreadable { // For multi-threaded.
 public:
  ComputeNormalizersClass(AmSgmm *am_sgmm,
                          int32 *entropy_count_ptr,
                          double *entropy_sum_ptr):
      am_sgmm_(am_sgmm), entropy_count_ptr_(entropy_count_ptr),
      entropy_sum_ptr_(entropy_sum_ptr), entropy_count_(0),
      entropy_sum_(0.0) { }

  ~ComputeNormalizersClass() {
    *entropy_count_ptr_ += entropy_count_;
    *entropy_sum_ptr_ += entropy_sum_;
  }

  inline void operator() () {
    // Note: give them local copy of the sums we're computing,
    // which will be propagated to original pointer in the destructor.
    am_sgmm_->ComputeNormalizersInternal(num_threads_, thread_id_,
                                         &entropy_count_,
                                         &entropy_sum_);
  }
 private:
  ComputeNormalizersClass() { }  // Disallow empty constructor.
  AmSgmm *am_sgmm_;                // model whose normalizers are computed
  int32 *entropy_count_ptr_;       // shared output: total substate count
  double *entropy_sum_ptr_;        // shared output: total weight entropy
  int32 entropy_count_;            // this thread's substate count
  double entropy_sum_;             // this thread's entropy sum

};
-
-void AmSgmm::ComputeNormalizers() {
- KALDI_LOG << "Computing normalizers";
- n_.resize(NumPdfs());
- int32 entropy_count = 0;
- double entropy_sum = 0.0;
- ComputeNormalizersClass c(this, &entropy_count, &entropy_sum);
- RunMultiThreaded(c);
-
- KALDI_LOG << "Entropy of weights in substates is "
- << (entropy_sum / entropy_count) << " over " << entropy_count
- << " substates, equivalent to perplexity of "
- << (Exp(entropy_sum /entropy_count));
- KALDI_LOG << "Done computing normalizers";
-}
-
-
// Computes the normalizers n_[j] (Eq. 31) for the contiguous block of
// states assigned to thread `thread` out of `num_threads`, and accumulates
// the entropy of the substate weight distributions into
// *entropy_count / *entropy_sum for diagnostics.
// Side effect: any SigmaInv_[i] that is not positive definite is reset to
// the identity (this mutation is shared across threads, hence the
// thread-0-only warning to avoid duplicate messages).
void AmSgmm::ComputeNormalizersInternal(int32 num_threads, int32 thread,
                                        int32 *entropy_count,
                                        double *entropy_sum) {

  BaseFloat DLog2pi = FeatureDim() * Log(2 * M_PI);
  Vector<BaseFloat> log_det_Sigma(NumGauss());

  for (int32 i = 0; i < NumGauss(); i++) {
    try {
      log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet();
    } catch(...) {
      if (thread == 0)  // just for one thread, print errors [else, duplicates]
        KALDI_WARN << "Covariance is not positive definite, setting to unit";
      SigmaInv_[i].SetUnit();
      log_det_Sigma(i) = 0.0;
    }
  }


  // Partition the states into equal contiguous blocks, one per thread.
  int block_size = (NumPdfs() + num_threads-1) / num_threads;
  int j_start = thread * block_size, j_end = std::min(NumPdfs(), j_start + block_size);

  for (int32 j = j_start; j < j_end; j++) {
    Matrix<BaseFloat> log_w_jm(NumSubstates(j), NumGauss());
    n_[j].Resize(NumGauss(), NumSubstates(j));
    Matrix<BaseFloat> mu_jmi(NumSubstates(j), FeatureDim());
    Matrix<BaseFloat> SigmaInv_mu(NumSubstates(j), FeatureDim());

    // (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
    log_w_jm.AddMatMat(1.0, v_[j], kNoTrans, w_, kTrans, 0.0);
    for (int32 m = 0; m < NumSubstates(j); m++) {
      log_w_jm.Row(m).Add(-1.0 * log_w_jm.Row(m).LogSumExp());
      { // DIAGNOSTIC CODE
        (*entropy_count)++;
        for (int32 i = 0; i < NumGauss(); i++) {
          (*entropy_sum) -= log_w_jm(m, i) * Exp(log_w_jm(m, i));
        }
      }
    }

    for (int32 i = 0; i < NumGauss(); i++) {
      // mu_jmi = M_{i} * v_{jm}
      mu_jmi.AddMatMat(1.0, v_[j], kNoTrans, M_[i], kTrans, 0.0);
      SigmaInv_mu.AddMatSp(1.0, mu_jmi, kNoTrans, SigmaInv_[i], 0.0);

      for (int32 m = 0; m < NumSubstates(j); m++) {
        // mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi}
        BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi.Row(m), SigmaInv_mu.Row(m));
        BaseFloat logc = Log(c_[j](m));

        // Suggestion: Both mu_jmi and SigmaInv_mu could
        // have been computed at once for i,
        // if M[i] was concatenated to single matrix over i indices

        // eq.(31)
        n_[j](i, m) = logc + log_w_jm(m, i) - 0.5 * (log_det_Sigma(i) + DLog2pi
            + mu_SigmaInv_mu);
        { // Mainly diagnostic code. Not necessary.
          BaseFloat tmp = n_[j](i, m);
          if (!KALDI_ISFINITE(tmp)) {  // NaN or inf
            KALDI_LOG << "Warning: normalizer for j = " << j << ", m = " << m
                      << ", i = " << i << " is infinite or NaN " << tmp << "= "
                      << (logc) << "+" << (log_w_jm(m, i)) << "+" << (-0.5 *
                          log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi)
                      << "+" << (mu_SigmaInv_mu) << ", setting to finite.";
            n_[j](i, m) = -1.0e+40;  // future work(arnab): get rid of magic number
          }
        }
      }
    }
  }
}
-
-
// Variant of ComputeNormalizers() in which, for each substate, the mixture
// weights are re-normalized to sum to one within each of the given
// `normalize_sets` of Gaussian indices (the sets must be disjoint and
// jointly cover all Gaussians).  Single-threaded.
void AmSgmm::ComputeNormalizersNormalized(
    const std::vector< std::vector<int32> > &normalize_sets) {
  { // Check sets in normalize_sets are disjoint and cover all Gaussians.
    std::set<int32> all;
    for (int32 i = 0; i < normalize_sets.size(); i++)
      for (int32 j = 0; static_cast<size_t>(j) < normalize_sets[i].size(); j++) {
        int32 n = normalize_sets[i][j];
        KALDI_ASSERT(all.count(n) == 0 && n >= 0 && n < NumGauss());
        all.insert(n);
      }
    KALDI_ASSERT(all.size() == NumGauss());
  }

  KALDI_LOG << "Computing normalizers [normalized]";
  BaseFloat DLog2pi = FeatureDim() * Log(2 * M_PI);
  Vector<BaseFloat> mu_jmi(FeatureDim());
  Vector<BaseFloat> SigmaInv_mu(FeatureDim());
  Vector<BaseFloat> log_det_Sigma(NumGauss());

  // Non-positive-definite covariances are reset to identity (as in
  // ComputeNormalizersInternal).
  for (int32 i = 0; i < NumGauss(); i++) {
    try {
      log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet();
    } catch(...) {
      KALDI_WARN << "Covariance is not positive definite, setting to unit";
      SigmaInv_[i].SetUnit();
      log_det_Sigma(i) = 0.0;
    }
  }

  n_.resize(NumPdfs());
  for (int32 j = 0; j < NumPdfs(); j++) {
    Vector<BaseFloat> log_w_jm(NumGauss());

    n_[j].Resize(NumGauss(), NumSubstates(j));
    for (int32 m = 0; m < NumSubstates(j); m++) {
      BaseFloat logc = Log(c_[j](m));

      // (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
      log_w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0);
      log_w_jm.Add((-1.0) * log_w_jm.LogSumExp());

      // Re-normalize the weights within each requested subset of Gaussians.
      for (int32 n = 0; n < normalize_sets.size(); n++) {
        const std::vector<int32> &this_set(normalize_sets[n]);
        double sum = 0.0;
        for (int32 p = 0; p < this_set.size(); p++)
          sum += Exp(log_w_jm(this_set[p]));
        double offset = -Log(sum);  // add "offset", to normalize weights.
        for (int32 p = 0; p < this_set.size(); p++)
          log_w_jm(this_set[p]) += offset;
      }

      for (int32 i = 0; i < NumGauss(); i++) {
        // mu_jmi = M_{i} * v_{jm}
        mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0);

        // mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi}
        SigmaInv_mu.AddSpVec(1.0, SigmaInv_[i], mu_jmi, 0.0);
        BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi, SigmaInv_mu);

        // Suggestion: Both mu_jmi and SigmaInv_mu could
        // have been computed at once for i ,
        // if M[i] was concatenated to single matrix over i indeces

        // eq.(31)
        n_[j](i, m) = logc + log_w_jm(i) - 0.5 * (log_det_Sigma(i) + DLog2pi
            + mu_SigmaInv_mu);
        { // Mainly diagnostic code. Not necessary.
          BaseFloat tmp = n_[j](i, m);
          if (!KALDI_ISFINITE(tmp)) {  // NaN or inf
            KALDI_LOG << "Warning: normalizer for j = " << j << ", m = " << m
                      << ", i = " << i << " is infinite or NaN " << tmp << "= "
                      << (logc) << "+" << (log_w_jm(i)) << "+" << (-0.5 *
                          log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi)
                      << "+" << (mu_SigmaInv_mu) << ", setting to finite.";
            n_[j](i, m) = -1.0e+40;  // future work(arnab): get rid of magic number
          }
        }
      }
    }
  }

  KALDI_LOG << "Done computing normalizers (normalized over subsets)";
}
-
-
// Computes the fMLLR pre-transform (Appendix B of the SGMM paper): an
// affine transform W_pre = [A_pre, b_pre] that whitens the within-class
// scatter and diagonalizes the between-class scatter of the model's
// Gaussian means, plus its inverse W_inv = [A_pre^{-1}, mu_avg] and the
// diagonal of the transformed mean-scatter matrix.
// state_occs weights each state's contribution; on zero total occupancy a
// unit transform is returned.
void AmSgmm::ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,
    Matrix<BaseFloat> *xform, Matrix<BaseFloat> *inv_xform,
    Vector<BaseFloat> *diag_mean_scatter) const {
  int32 num_states = NumPdfs(),
      num_gauss = NumGauss(),
      dim = FeatureDim();
  KALDI_ASSERT(state_occs.Dim() == num_states);

  BaseFloat total_occ = state_occs.Sum();

  // Degenerate case: unlikely to ever happen.
  if (total_occ == 0) {
    KALDI_WARN << "Zero probability (computing transform). Using unit "
               << "pre-transform";
    xform->Resize(dim, dim + 1, kUndefined);
    xform->SetUnit();
    inv_xform->Resize(dim, dim + 1, kUndefined);
    inv_xform->SetUnit();
    diag_mean_scatter->Resize(dim, kSetZero);
    return;
  }

  // Convert state occupancies to posteriors; Eq. (B.1)
  Vector<BaseFloat> state_posteriors(state_occs);
  state_posteriors.Scale(1/total_occ);

  Vector<BaseFloat> mu_jmi(dim), global_mean(dim);
  SpMatrix<BaseFloat> within_class_covar(dim), between_class_covar(dim);
  Vector<BaseFloat> gauss_weight(num_gauss);  // weights for within-class vars.
  Vector<BaseFloat> w_jm(num_gauss);
  BaseFloat substate_weight;
  // Accumulate the global mean, between-class scatter, and per-Gaussian
  // weights over all (state, substate, Gaussian) triples.
  for (int32 j = 0; j < num_states; j++) {
    for (int32 m = 0; m < NumSubstates(j); m++) {
      // Eq. (7): w_jm = softmax([w_{1}^T ... w_{D}^T] * v_{jm})
      w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0);
      w_jm.ApplySoftMax();

      for (int32 i = 0; i < num_gauss; i++) {
        substate_weight = state_posteriors(j) * c_[j](m) * w_jm(i);
        mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0);  // Eq. (6)
        // Eq. (B.3): \mu_avg = \sum_{jmi} p(j) c_{jm} w_{jmi} \mu_{jmi}
        global_mean.AddVec(substate_weight, mu_jmi);
        // \Sigma_B = \sum_{jmi} p(j) c_{jm} w_{jmi} \mu_{jmi} \mu_{jmi}^T
        between_class_covar.AddVec2(substate_weight, mu_jmi);  // Eq. (B.4)
        gauss_weight(i) += substate_weight;
      }
    }
  }
  between_class_covar.AddVec2(-1.0, global_mean);  // Eq. (B.4)

  for (int32 i = 0; i < num_gauss; i++) {
    SpMatrix<BaseFloat> Sigma(SigmaInv_[i]);
    Sigma.InvertDouble();
    // Eq. (B.2): \Sigma_W = \sum_{jmi} p(j) c_{jm} w_{jmi} \Sigma_i
    within_class_covar.AddSp(gauss_weight(i), Sigma);
  }

  TpMatrix<BaseFloat> tmpL(dim);
  Matrix<BaseFloat> tmpLInvFull(dim, dim);
  tmpL.Cholesky(within_class_covar);  // \Sigma_W = L L^T
  tmpL.InvertDouble();  // L^{-1}
  tmpLInvFull.CopyFromTp(tmpL);  // get as full matrix.

  // B := L^{-1} * \Sigma_B * L^{-T}
  SpMatrix<BaseFloat> tmpB(dim);
  tmpB.AddMat2Sp(1.0, tmpLInvFull, kNoTrans, between_class_covar, 0.0);

  Matrix<BaseFloat> U(dim, dim);
  diag_mean_scatter->Resize(dim);
  xform->Resize(dim, dim + 1);
  inv_xform->Resize(dim, dim + 1);

  tmpB.Eig(diag_mean_scatter, &U);  // Eq. (B.5): B = U D V^T
  int32 n;
  // Floor tiny eigenvalues to keep later per-dimension divisions stable.
  if ((n = diag_mean_scatter->ApplyFloor(1.0e-04)) != 0)
    KALDI_WARN << "Floored " << n << " elements of the mean-scatter matrix.";

  // Eq. (B.6): A_{pre} = U^T * L^{-1}
  SubMatrix<BaseFloat> Apre(*xform, 0, dim, 0, dim);
  Apre.AddMatMat(1.0, U, kTrans, tmpLInvFull, kNoTrans, 0.0);

#ifdef KALDI_PARANOID
  {
    SpMatrix<BaseFloat> tmp(dim);
    tmp.AddMat2Sp(1.0, Apre, kNoTrans, within_class_covar, 0.0);
    KALDI_ASSERT(tmp.IsUnit(0.01));
  }
  {
    SpMatrix<BaseFloat> tmp(dim);
    tmp.AddMat2Sp(1.0, Apre, kNoTrans, between_class_covar, 0.0);
    KALDI_ASSERT(tmp.IsDiagonal(0.01));
  }
#endif

  // Eq. (B.7): b_{pre} = - A_{pre} \mu_{avg}
  Vector<BaseFloat> b_pre(dim);
  b_pre.AddMatVec(-1.0, Apre, kNoTrans, global_mean, 0.0);
  for (int32 r = 0; r < dim; r++) {
    xform->Row(r)(dim) = b_pre(r);  // W_{pre} = [ A_{pre}, b_{pre} ]
  }

  // Eq. (B.8) & (B.9): W_{inv} = [ A_{pre}^{-1}, \mu_{avg} ]
  inv_xform->CopyFromMat(*xform);
  inv_xform->Range(0, dim, 0, dim).InvertDouble();
  for (int32 r = 0; r < dim; r++)
    inv_xform->Row(r)(dim) = global_mean(r);
}  // End of ComputePreXform()
-
-template<typename Real>
-void AmSgmm::GetNtransSigmaInv(vector< Matrix<Real> > *out) const {
- KALDI_ASSERT(SpkSpaceDim() > 0 &&
- "Cannot compute N^{T} \\Sigma_{i}^{-1} without speaker projections.");
- out->resize(NumGauss());
- Matrix<Real> tmpcov(FeatureDim(), FeatureDim());
- Matrix<Real> tmp_n(FeatureDim(), SpkSpaceDim());
- for (int32 i = 0; i < NumGauss(); i++) {
- tmpcov.CopyFromSp(SigmaInv_[i]);
- tmp_n.CopyFromMat(N_[i]);
- (*out)[i].Resize(SpkSpaceDim(), FeatureDim());
- (*out)[i].AddMatMat(1.0, tmp_n, kTrans, tmpcov, kNoTrans, 0.0);
- }
-}
-
-// Instantiate the above template.
-template
-void AmSgmm::GetNtransSigmaInv(vector< Matrix<float> > *out) const;
-template
-void AmSgmm::GetNtransSigmaInv(vector< Matrix<double> > *out) const;
-
-///////////////////////////////////////////////////////////////////////////////
-
-template<class Real>
-void AmSgmm::ComputeH(std::vector< SpMatrix<Real> > *H_i) const {
- KALDI_ASSERT(NumGauss() != 0);
- (*H_i).resize(NumGauss());
- SpMatrix<BaseFloat> H_i_tmp(PhoneSpaceDim());
- for (int32 i = 0; i < NumGauss(); i++) {
- (*H_i)[i].Resize(PhoneSpaceDim());
- H_i_tmp.AddMat2Sp(1.0, M_[i], kTrans, SigmaInv_[i], 0.0);
- (*H_i)[i].CopyFromSp(H_i_tmp);
- }
-}
-
-// Instantiate the template.
-template
-void AmSgmm::ComputeH(std::vector< SpMatrix<float> > *H_i) const;
-template
-void AmSgmm::ComputeH(std::vector< SpMatrix<double> > *H_i) const;
-
-
-// Initializes the matrices M_{i} and w_i
-void AmSgmm::InitializeMw(int32 phn_subspace_dim,
- const Matrix<BaseFloat> &norm_xform) {
- int32 ddim = full_ubm_.Dim();
- KALDI_ASSERT(phn_subspace_dim <= ddim + 1);
- KALDI_ASSERT(phn_subspace_dim <= norm_xform.NumCols() + 1);
- KALDI_ASSERT(ddim <= norm_xform.NumRows());
-
- Vector<BaseFloat> mean(ddim);
- int32 num_gauss = full_ubm_.NumGauss();
- w_.Resize(num_gauss, phn_subspace_dim);
- M_.resize(num_gauss);
- for (int32 i = 0; i < num_gauss; i++) {
- full_ubm_.GetComponentMean(i, &mean);
- Matrix<BaseFloat> &thisM(M_[i]);
- thisM.Resize(ddim, phn_subspace_dim);
- // Eq. (27): M_{i} = [ \bar{\mu}_{i} (J)_{1:D, 1:(S-1)}]
- thisM.CopyColFromVec(mean, 0);
- thisM.Range(0, ddim, 1, phn_subspace_dim-1).CopyFromMat(
- norm_xform.Range(0, ddim, 0, phn_subspace_dim-1), kNoTrans);
- }
-}
-
-// Initializes the matrices N_{i}
-void AmSgmm::InitializeN(int32 spk_subspace_dim,
- const Matrix<BaseFloat> &norm_xform) {
- int32 ddim = full_ubm_.Dim();
- KALDI_ASSERT(spk_subspace_dim <= ddim);
- KALDI_ASSERT(spk_subspace_dim <= norm_xform.NumCols());
- KALDI_ASSERT(ddim <= norm_xform.NumRows());
-
- int32 num_gauss = full_ubm_.NumGauss();
- N_.resize(num_gauss);
- for (int32 i = 0; i < num_gauss; i++) {
- N_[i].Resize(ddim, spk_subspace_dim);
- // Eq. (28): N_{i} = [ (J)_{1:D, 1:T)}]
- N_[i].CopyFromMat(norm_xform.Range(0, ddim, 0, spk_subspace_dim), kNoTrans);
- }
-}
-
-// Initializes the vectors v_{jm}
-void AmSgmm::InitializeVecs(int32 num_states) {
- KALDI_ASSERT(num_states >= 0);
- int32 phn_subspace_dim = PhoneSpaceDim();
- KALDI_ASSERT(phn_subspace_dim > 0 && "Initialize M and w first.");
-
- v_.resize(num_states);
- c_.resize(num_states);
- for (int32 j = 0; j < num_states; j++) {
- v_[j].Resize(1, phn_subspace_dim);
- c_[j].Resize(1);
- v_[j](0, 0) = 1.0; // Eq. (26): v_{j1} = [1 0 0 ... 0]
- c_[j](0) = 1.0; // Eq. (25): c_{j1} = 1.0
- }
-}
-
-// Initializes the within-class vars Sigma_{ki}
-void AmSgmm::InitializeCovars() {
- std::vector< SpMatrix<BaseFloat> > &inv_covars(full_ubm_.inv_covars());
- int32 num_gauss = full_ubm_.NumGauss();
- int32 dim = full_ubm_.Dim();
- SigmaInv_.resize(num_gauss);
- for (int32 i = 0; i < num_gauss; i++) {
- SigmaInv_[i].Resize(dim);
- SigmaInv_[i].CopyFromSp(inv_covars[i]);
- }
-}
-
-// Compute the "smoothing" matrices from expected counts given the model.
-void AmSgmm::ComputeSmoothingTermsFromModel(
- const std::vector< SpMatrix<BaseFloat> > &H,
- const Vector<BaseFloat> &state_occupancies, SpMatrix<BaseFloat> *H_sm,
- BaseFloat max_cond) const {
- int32 num_gauss = NumGauss();
- BaseFloat tot_sum = 0.0;
- KALDI_ASSERT(state_occupancies.Dim() == NumPdfs());
- Vector<BaseFloat> w_jm(num_gauss);
- H_sm->Resize(PhoneSpaceDim());
- H_sm->SetZero();
- Vector<BaseFloat> gamma_i(num_gauss);
- gamma_i.SetZero();
- for (int32 j = 0; j < NumPdfs(); j++) {
- int32 M_j = NumSubstates(j);
- KALDI_ASSERT(M_j > 0);
- for (int32 m = 0; m < M_j; m++) {
- w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0);
- w_jm.ApplySoftMax();
- gamma_i.AddVec(state_occupancies(j) * c_[j](m), w_jm);
- }
- }
- BaseFloat sum = 0.0;
- for (int32 i = 0; i < num_gauss; i++) {
- if (gamma_i(i) > 0) {
- H_sm->AddSp(gamma_i(i), H[i]);
- sum += gamma_i(i);
- }
- }
- if (sum == 0.0) {
- KALDI_WARN << "Sum of counts is zero. ";
- // set to unit matrix--arbitrary non-singular matrix.. won't ever matter.
- H_sm->SetUnit();
- } else {
- H_sm->Scale(1.0 / sum);
- int32 tmp = H_sm->LimitCondDouble(max_cond);
- if (tmp > 0) {
- KALDI_WARN << "Limited " << (tmp) << " eigenvalues of H_sm";
- }
- }
- tot_sum += sum;
-
- KALDI_LOG << "ComputeSmoothingTermsFromModel: total count is " << tot_sum;
-}
-
-void ComputeFeatureNormalizer(const FullGmm &gmm, Matrix<BaseFloat> *xform) {
- int32 dim = gmm.Dim();
- int32 num_gauss = gmm.NumGauss();
- SpMatrix<BaseFloat> within_class_covar(dim);
- SpMatrix<BaseFloat> between_class_covar(dim);
- Vector<BaseFloat> global_mean(dim);
-
- // Accumulate LDA statistics from the GMM parameters.
- {
- BaseFloat total_weight = 0.0;
- Vector<BaseFloat> tmp_weight(num_gauss);
- Matrix<BaseFloat> tmp_means;
- std::vector< SpMatrix<BaseFloat> > tmp_covars;
- tmp_weight.CopyFromVec(gmm.weights());
- gmm.GetCovarsAndMeans(&tmp_covars, &tmp_means);
- for (int32 i = 0; i < num_gauss; i++) {
- BaseFloat w_i = tmp_weight(i);
- total_weight += w_i;
- within_class_covar.AddSp(w_i, tmp_covars[i]);
- between_class_covar.AddVec2(w_i, tmp_means.Row(i));
- global_mean.AddVec(w_i, tmp_means.Row(i));
- }
- KALDI_ASSERT(total_weight > 0);
- if (fabs(total_weight - 1.0) > 0.001) {
- KALDI_WARN << "Total weight across the GMMs is " << (total_weight)
- << ", renormalizing.";
- global_mean.Scale(1.0 / total_weight);
- within_class_covar.Scale(1.0 / total_weight);
- between_class_covar.Scale(1.0 / total_weight);
- }
- between_class_covar.AddVec2(-1.0, global_mean);
- }
-
- TpMatrix<BaseFloat> chol(dim);
- chol.Cholesky(within_class_covar); // Sigma_W = L L^T
- TpMatrix<BaseFloat> chol_inv(chol);
- chol_inv.InvertDouble();
- Matrix<BaseFloat> chol_full(dim, dim);
- chol_full.CopyFromTp(chol_inv);
- SpMatrix<BaseFloat> LBL(dim);
- // LBL = L^{-1} \Sigma_B L^{-T}
- LBL.AddMat2Sp(1.0, chol_full, kNoTrans, between_class_covar, 0.0);
- Vector<BaseFloat> Dvec(dim);
- Matrix<BaseFloat> U(dim, dim);
- LBL.Eig(&Dvec, &U);
- SortSvd(&Dvec, &U);
-
- xform->Resize(dim, dim);
- chol_full.CopyFromTp(chol);
- // T := L U, eq (23)
- xform->AddMatMat(1.0, chol_full, kNoTrans, U, kNoTrans, 0.0);
-
-#ifdef KALDI_PARANOID
- Matrix<BaseFloat> inv_xform(*xform);
- inv_xform.InvertDouble();
- { // Check that T*within_class_covar*T' = I.
- Matrix<BaseFloat> wc_covar_full(dim, dim), tmp(dim, dim);
- wc_covar_full.CopyFromSp(within_class_covar);
- tmp.AddMatMat(1.0, inv_xform, kNoTrans, wc_covar_full, kNoTrans, 0.0);
- wc_covar_full.AddMatMat(1.0, tmp, kNoTrans, inv_xform, kTrans, 0.0);
- KALDI_ASSERT(wc_covar_full.IsUnit(0.01));
- }
- { // Check that T*between_class_covar*T' = diagonal.
- Matrix<BaseFloat> bc_covar_full(dim, dim), tmp(dim, dim);
- bc_covar_full.CopyFromSp(between_class_covar);
- tmp.AddMatMat(1.0, inv_xform, kNoTrans, bc_covar_full, kNoTrans, 0.0);
- bc_covar_full.AddMatMat(1.0, tmp, kNoTrans, inv_xform, kTrans, 0.0);
- KALDI_ASSERT(bc_covar_full.IsDiagonal(0.01));
- }
-#endif
-}
-
-void AmSgmm::ComputePerSpkDerivedVars(SgmmPerSpkDerivedVars *vars) const {
- KALDI_ASSERT(vars != NULL);
- if (vars->v_s.Dim() != 0) {
- KALDI_ASSERT(vars->v_s.Dim() == SpkSpaceDim());
- vars->o_s.Resize(NumGauss(), FeatureDim());
- int32 num_gauss = NumGauss();
- for (int32 i = 0; i < num_gauss; i++) {
- // Eqn. (32): o_i^{(s)} = N_i v^{(s)}
- vars->o_s.Row(i).AddMatVec(1.0, N_[i], kNoTrans, vars->v_s, 0.0);
- }
- } else {
- vars->o_s.Resize(0, 0);
- }
-}
-
-BaseFloat AmSgmm::GaussianSelection(const SgmmGselectConfig &config,
- const VectorBase<BaseFloat> &data,
- std::vector<int32> *gselect) const {
- KALDI_ASSERT(diag_ubm_.NumGauss() != 0 &&
- diag_ubm_.NumGauss() == full_ubm_.NumGauss() &&
- diag_ubm_.Dim() == data.Dim());
- KALDI_ASSERT(config.diag_gmm_nbest > 0 && config.full_gmm_nbest > 0 &&
- config.full_gmm_nbest < config.diag_gmm_nbest);
- int32 num_gauss = diag_ubm_.NumGauss();
-
- std::vector< std::pair<BaseFloat, int32> > pruned_pairs;
- if (config.diag_gmm_nbest < num_gauss) {
- Vector<BaseFloat> loglikes(num_gauss);
- diag_ubm_.LogLikelihoods(data, &loglikes);
- Vector<BaseFloat> loglikes_copy(loglikes);
- BaseFloat *ptr = loglikes_copy.Data();
- std::nth_element(ptr, ptr+num_gauss-config.diag_gmm_nbest, ptr+num_gauss);
- BaseFloat thresh = ptr[num_gauss-config.diag_gmm_nbest];
- for (int32 g = 0; g < num_gauss; g++)
- if (loglikes(g) >= thresh) // met threshold for diagonal phase.
- pruned_pairs.push_back(
- std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g));
- } else {
- Vector<BaseFloat> loglikes(num_gauss);
- full_ubm_.LogLikelihoods(data, &loglikes);
- for (int32 g = 0; g < num_gauss; g++)
- pruned_pairs.push_back(std::make_pair(loglikes(g), g));
- }
- KALDI_ASSERT(!pruned_pairs.empty());
- if (pruned_pairs.size() > static_cast<size_t>(config.full_gmm_nbest)) {
- std::nth_element(pruned_pairs.begin(),
- pruned_pairs.end() - config.full_gmm_nbest,
- pruned_pairs.end());
- pruned_pairs.erase(pruned_pairs.begin(),
- pruned_pairs.end() - config.full_gmm_nbest);
- }
- Vector<BaseFloat> loglikes_tmp(pruned_pairs.size()); // for return value.
- KALDI_ASSERT(gselect != NULL);
- gselect->resize(pruned_pairs.size());
- // Make sure pruned Gaussians appear from best to worst.
- std::sort(pruned_pairs.begin(), pruned_pairs.end(),
- std::greater< std::pair<BaseFloat, int32> >());
- for (size_t i = 0; i < pruned_pairs.size(); i++) {
- loglikes_tmp(i) = pruned_pairs[i].first;
- (*gselect)[i] = pruned_pairs[i].second;
- }
- return loglikes_tmp.LogSumExp();
-}
-
-BaseFloat AmSgmm::GaussianSelectionPreselect(const SgmmGselectConfig &config,
- const VectorBase<BaseFloat> &data,
- const std::vector<int32> &preselect,
- std::vector<int32> *gselect) const {
- KALDI_ASSERT(IsSortedAndUniq(preselect) && !preselect.empty());
- KALDI_ASSERT(diag_ubm_.NumGauss() != 0 &&
- diag_ubm_.NumGauss() == full_ubm_.NumGauss() &&
- diag_ubm_.Dim() == data.Dim());
-
- int32 num_preselect = preselect.size();
-
- KALDI_ASSERT(config.diag_gmm_nbest > 0 && config.full_gmm_nbest > 0 &&
- config.full_gmm_nbest < num_preselect);
-
- std::vector<std::pair<BaseFloat, int32> > pruned_pairs;
- if (config.diag_gmm_nbest < num_preselect) {
- Vector<BaseFloat> loglikes(num_preselect);
- diag_ubm_.LogLikelihoodsPreselect(data, preselect, &loglikes);
- Vector<BaseFloat> loglikes_copy(loglikes);
- BaseFloat *ptr = loglikes_copy.Data();
- std::nth_element(ptr, ptr+num_preselect-config.diag_gmm_nbest,
- ptr+num_preselect);
- BaseFloat thresh = ptr[num_preselect-config.diag_gmm_nbest];
- for (int32 p = 0; p < num_preselect; p++) {
- if (loglikes(p) >= thresh) { // met threshold for diagonal phase.
- int32 g = preselect[p];
- pruned_pairs.push_back(
- std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g));
- }
- }
- } else {
- for (int32 p = 0; p < num_preselect; p++) {
- int32 g = preselect[p];
- pruned_pairs.push_back(
- std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g));
- }
- }
- KALDI_ASSERT(!pruned_pairs.empty());
- if (pruned_pairs.size() > static_cast<size_t>(config.full_gmm_nbest)) {
- std::nth_element(pruned_pairs.begin(),
- pruned_pairs.end() - config.full_gmm_nbest,
- pruned_pairs.end());
- pruned_pairs.erase(pruned_pairs.begin(),
- pruned_pairs.end() - config.full_gmm_nbest);
- }
- // Make sure pruned Gaussians appear from best to worst.
- std::sort(pruned_pairs.begin(), pruned_pairs.end(),
- std::greater<std::pair<BaseFloat, int32> >());
- Vector<BaseFloat> loglikes_tmp(pruned_pairs.size()); // for return value.
- KALDI_ASSERT(gselect != NULL);
- gselect->resize(pruned_pairs.size());
- for (size_t i = 0; i < pruned_pairs.size(); i++) {
- loglikes_tmp(i) = pruned_pairs[i].first;
- (*gselect)[i] = pruned_pairs[i].second;
- }
- return loglikes_tmp.LogSumExp();
-}
-
-
-
-void SgmmGauPost::Write(std::ostream &os, bool binary) const {
- WriteToken(os, binary, "<SgmmGauPost>");
- int32 T = this->size();
- WriteBasicType(os, binary, T);
- for (int32 t = 0; t < T; t++) {
- WriteToken(os, binary, "<gselect>");
- WriteIntegerVector(os, binary, (*this)[t].gselect);
- WriteToken(os, binary, "<tids>");
- WriteIntegerVector(os, binary, (*this)[t].tids);
- KALDI_ASSERT((*this)[t].tids.size() == (*this)[t].posteriors.size());
- for (size_t i = 0; i < (*this)[t].posteriors.size(); i++) {
- (*this)[t].posteriors[i].Write(os, binary);
- }
- }
- WriteToken(os, binary, "</SgmmGauPost>");
-}
-
-void SgmmGauPost::Read(std::istream &is, bool binary) {
- ExpectToken(is, binary, "<SgmmGauPost>");
- int32 T;
- ReadBasicType(is, binary, &T);
- KALDI_ASSERT(T >= 0);
- this->resize(T);
- for (int32 t = 0; t < T; t++) {
- ExpectToken(is, binary, "<gselect>");
- ReadIntegerVector(is, binary, &((*this)[t].gselect));
- ExpectToken(is, binary, "<tids>");
- ReadIntegerVector(is, binary, &((*this)[t].tids));
- size_t sz = (*this)[t].tids.size();
- (*this)[t].posteriors.resize(sz);
- for (size_t i = 0; i < sz; i++)
- (*this)[t].posteriors[i].Read(is, binary);
- }
- ExpectToken(is, binary, "</SgmmGauPost>");
-}
-
-
-void AmSgmmFunctions::ComputeDistances(const AmSgmm &model,
- const Vector<BaseFloat> &state_occs,
- MatrixBase<BaseFloat> *dists) {
- int32 num_states = model.NumPdfs(),
- phn_space_dim = model.PhoneSpaceDim(),
- num_gauss = model.NumGauss();
- KALDI_ASSERT(dists != NULL && dists->NumRows() == num_states
- && dists->NumCols() == num_states);
- Vector<double> prior(state_occs);
- KALDI_ASSERT(prior.Sum() != 0.0);
- prior.Scale(1.0 / prior.Sum()); // Normalize.
- SpMatrix<BaseFloat> H(phn_space_dim); // The same as H_sm in some other code.
- for (int32 i = 0; i < num_gauss; i++) {
- SpMatrix<BaseFloat> Hi(phn_space_dim);
- Hi.AddMat2Sp(1.0, model.M_[i], kTrans, model.SigmaInv_[i], 0.0);
- H.AddSp(prior(i), Hi);
- }
- bool warned = false;
- for (int32 j1 = 0; j1 < num_states; ++j1) {
- if (model.NumSubstates(j1) != 1 && !warned) {
- KALDI_WARN << "ComputeDistances() can only give meaningful output if you "
- << "have one substate per state.";
- warned = true;
- }
- for (int32 j2 = 0; j2 <= j1; ++j2) {
- Vector<BaseFloat> v_diff(model.v_[j1].Row(0));
- v_diff.AddVec(-1.0, model.v_[j2].Row(0));
- (*dists)(j1, j2) = (*dists)(j2, j1) = VecSpVec(v_diff, H, v_diff);
- }
- }
-}
-
-} // namespace kaldi
diff --git a/src/sgmm/am-sgmm.h b/src/sgmm/am-sgmm.h
--- a/src/sgmm/am-sgmm.h
+++ /dev/null
@@ -1,420 +0,0 @@
-// sgmm/am-sgmm.h
-
-// Copyright 2009-2011 Microsoft Corporation; Lukas Burget;
-// Saarland University (Author: Arnab Ghoshal);
-// Ondrej Glembek; Yanmin Qian;
-// Copyright 2012-2013 Johns Hopkins University (author: Daniel Povey)
-// Liang Lu; Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM_AM_SGMM_H_
-#define KALDI_SGMM_AM_SGMM_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "matrix/matrix-lib.h"
-#include "gmm/model-common.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/full-gmm.h"
-#include "itf/options-itf.h"
-#include "util/table-types.h"
-
-namespace kaldi {
-
-struct SgmmGselectConfig {
- /// Number of highest-scoring full-covariance Gaussians per frame.
- int32 full_gmm_nbest;
- /// Number of highest-scoring diagonal-covariance Gaussians per frame.
- int32 diag_gmm_nbest;
-
- SgmmGselectConfig() {
- full_gmm_nbest = 15;
- diag_gmm_nbest = 50;
- }
-
- void Register(OptionsItf *opts) {
- opts->Register("full-gmm-nbest", &full_gmm_nbest, "Number of highest-scoring"
- " full-covariance Gaussians selected per frame.");
- opts->Register("diag-gmm-nbest", &diag_gmm_nbest, "Number of highest-scoring"
- " diagonal-covariance Gaussians selected per frame.");
- }
-};
-
-/** \struct SgmmPerFrameDerivedVars
- * Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and
- * n_{i}(t) (cf. Eq. (33)-(36)) for the SGMM, as well as the cached Gaussian
- * selection records.
- */
-struct SgmmPerFrameDerivedVars {
- std::vector<int32> gselect;
- Vector<BaseFloat> xt; ///< x'(t), FMLLR-adapted, dim = [D], eq.(33)
- Matrix<BaseFloat> xti; ///< x_{i}(t) = x'(t) - o_i(s): dim = [I][D], eq.(34)
- Matrix<BaseFloat> zti; ///< z_{i}(t), dim = [I][S], eq.(35)
- Vector<BaseFloat> nti; ///< n_{i}(t), dim = [I], eq.(36)
-
- SgmmPerFrameDerivedVars() : xt(0), xti(0, 0), zti(0, 0), nti(0) {}
- void Resize(int32 ngauss, int32 feat_dim, int32 phn_dim) {
- xt.Resize(feat_dim);
- xti.Resize(ngauss, feat_dim);
- zti.Resize(ngauss, phn_dim);
- nti.Resize(ngauss);
- }
- bool IsEmpty() const {
- return (xt.Dim() == 0 || xti.NumRows() == 0 || zti.NumRows() == 0
- || nti.Dim() == 0);
- }
- bool NeedsResizing(int32 ngauss, int32 feat_dim, int32 phn_dim) const {
- /* if (xt.Dim() != feat_dim)
- KALDI_LOG << "xt dim = " << xt.Dim() << ", feat dim = " << feat_dim;
- if (xti.NumRows() != ngauss || xti.NumCols() != feat_dim)
- KALDI_LOG << "xti size = " << xti.NumRows() << ", " << xti.NumCols()
- << "; ngauss = " << ngauss << ", feat dim = " << feat_dim;
- if (zti.NumRows() != ngauss || zti.NumCols() != phn_dim)
- KALDI_LOG << "zti size = " << zti.NumRows() << ", " << zti.NumCols()
- << "; ngauss = " << ngauss << "; phn dim = " << phn_dim;
- if (nti.Dim() != ngauss)
- KALDI_LOG << "nti dim = " << nti.Dim() << ", ngauss = " << ngauss;
- */
- return (xt.Dim() != feat_dim || xti.NumRows() != ngauss
- || xti.NumCols() != feat_dim || zti.NumRows() != ngauss
- || zti.NumCols() != phn_dim || nti.Dim() != ngauss);
- }
-};
-
-
-struct SgmmPerSpkDerivedVars {
- // To set this up, call ComputePerSpkDerivedVars from the sgmm object.
- void Clear() {
- v_s.Resize(0);
- o_s.Resize(0, 0);
- }
- Vector<BaseFloat> v_s; ///< Speaker adaptation vector v_^{(s)}. Dim is [T]
- Matrix<BaseFloat> o_s; ///< Per-speaker offsets o_{i}. Dimension is [I][D]
-};
-
-
-/** \class AmSgmm
- * Class for definition of the subspace Gmm acoustic model
- */
-class AmSgmm {
- public:
- AmSgmm() {}
- void Read(std::istream &rIn, bool binary);
- void Write(std::ostream &out, bool binary,
- SgmmWriteFlagsType write_params) const;
-
- /// Checks the various components for correct sizes. With wrong sizes,
- /// assertion failure occurs. When the argument is set to true, dimensions of
- /// the various components are printed.
- void Check(bool show_properties = true);
-
- /// Initializes the SGMM parameters from a full-covariance UBM.
- void InitializeFromFullGmm(const FullGmm &gmm, int32 num_states,
- int32 phn_subspace_dim, int32 spk_subspace_dim);
-
- /// Used to copy models (useful in update)
- void CopyFromSgmm(const AmSgmm &other, bool copy_normalizers);
-
- /// Copies the global parameters from the supplied model, but sets
- /// the state vectors to zero. Supports reducing the phonetic
- /// and speaker subspace dimensions.
- void CopyGlobalsInitVecs(const AmSgmm &other, int32 phn_subspace_dim,
- int32 spk_subspace_dim, int32 num_pdfs);
-
- /// Computes the top-scoring Gaussian indices (used for pruning of later
- /// stages of computation). Returns frame log-likelihood given selected
- /// Gaussians from full UBM.
- BaseFloat GaussianSelection(const SgmmGselectConfig &config,
- const VectorBase<BaseFloat> &data,
- std::vector<int32> *gselect) const;
-
- /// As GaussianSelection, but limiting it to a provided list of
- /// preselected Gaussians (e.g. for gender dependency).
- /// The list "preselect" must be sorted and uniq.
- BaseFloat GaussianSelectionPreselect(const SgmmGselectConfig &config,
- const VectorBase<BaseFloat> &data,
- const std::vector<int32> &preselect,
- std::vector<int32> *gselect) const;
-
- /// This needs to be called with each new frame of data, prior to accumulation
- /// or likelihood evaluation: it computes various pre-computed quantities. The
- /// 'logdet_s' term is the log determinant of the FMLLR transform, or 0.0 if
- /// no FMLLR is used or it's single-class fMLLR applied in the feature
- /// extraction, and we're not keeping track of it here.
- void ComputePerFrameVars(const VectorBase<BaseFloat> &data,
- const std::vector<int32> &gselect,
- const SgmmPerSpkDerivedVars &spk_vars,
- BaseFloat logdet_s,
- SgmmPerFrameDerivedVars *per_frame_vars) const;
-
- /// Computes the per-speaker derived vars; assumes vars->v_s is already
- /// set up.
- void ComputePerSpkDerivedVars(SgmmPerSpkDerivedVars *vars) const;
-
- /// This does a likelihood computation for a given state using the
- /// top-scoring Gaussian components (in per_frame_vars). If the
- /// log_prune parameter is nonzero (e.g. 5.0), the LogSumExp() stage is
- /// pruned, which is a significant speedup... smaller values are faster.
- BaseFloat LogLikelihood(const SgmmPerFrameDerivedVars &per_frame_vars,
- int32 state_index, BaseFloat log_prune = 0.0) const;
-
- /// Similar to LogLikelihood() function above, but also computes the posterior
- /// probabilities for the top-scoring Gaussian components and all substates.
- BaseFloat ComponentPosteriors(const SgmmPerFrameDerivedVars &per_frame_vars,
- int32 state, Matrix<BaseFloat> *post) const;
-
- /// Increases the total number of substates based on the state occupancies.
- void SplitSubstates(const Vector<BaseFloat> &state_occupancies,
- int32 target_nsubstates,
- BaseFloat perturb,
- BaseFloat power,
- BaseFloat cond);
-
- /// Functions for increasing the phonetic and speaker space dimensions.
- /// The argument norm_xform is a LDA-like feature normalizing transform,
- /// computed by the ComputeFeatureNormalizer function.
- void IncreasePhoneSpaceDim(int32 target_dim,
- const Matrix<BaseFloat> &norm_xform);
- void IncreaseSpkSpaceDim(int32 target_dim,
- const Matrix<BaseFloat> &norm_xform);
-
- /// Computes (and initializes if necessary) derived vars...
- /// for now this is just the normalizers "n" and the diagonal UBM.
- void ComputeDerivedVars();
-
- /// Computes the data-independent terms in the log-likelihood computation
- /// for each Gaussian component and all substates. Eq. (31)
- void ComputeNormalizers();
-
- /// Computes the normalizers, while normalizing the weights to one
- /// among each of the sets in "normalize_sets": these sets should
- /// be disjoint and their union should be all the indices 0 ... I-1.
- void ComputeNormalizersNormalized(
- const std::vector< std::vector<int32> > &normalize_sets);
-
- /// Computes the LDA-like pre-transform and its inverse as well as the
- /// eigenvalues of the scatter of the means used in FMLLR estimation.
- void ComputeFmllrPreXform(const Vector<BaseFloat> &state_occs,
- Matrix<BaseFloat> *xform,
- Matrix<BaseFloat> *inv_xform,
- Vector<BaseFloat> *diag_mean_scatter) const;
-
- /// Various model dimensions.
- int32 NumPdfs() const { return c_.size(); }
- int32 NumSubstates(int32 j) const { return c_[j].Dim(); }
- int32 NumGauss() const { return M_.size(); }
- int32 PhoneSpaceDim() const { return w_.NumCols(); }
- int32 SpkSpaceDim() const { return (N_.size() > 0) ? N_[0].NumCols() : 0; }
- int32 FeatureDim() const { return M_[0].NumRows(); }
-
- void RemoveSpeakerSpace() { N_.clear(); }
-
- /// Accessors
- const FullGmm & full_ubm() const { return full_ubm_; }
- const DiagGmm & diag_ubm() const { return diag_ubm_; }
-
- const Matrix<BaseFloat>& StateVectors(int32 state_index) const {
- return v_[state_index];
- }
- const SpMatrix<BaseFloat>& GetInvCovars(int32 gauss_index) const {
- return SigmaInv_[gauss_index];
- }
- const Matrix<BaseFloat>& GetPhoneProjection(int32 gauss_index) const {
- return M_[gauss_index];
- }
-
- /// Templated accessors (used to accumulate in different precision)
- template<typename Real>
- void GetInvCovars(int32 gauss_index, SpMatrix<Real> *out) const;
-
- template<typename Real>
- void GetSubstateMean(int32 j, int32 m, int32 i,
- VectorBase<Real> *mean_out) const;
-
- template<typename Real>
- void GetSubstateSpeakerMean(int32 state, int32 substate, int32 gauss,
- const SgmmPerSpkDerivedVars &spk,
- VectorBase<Real> *mean_out) const;
-
- template<typename Real>
- void GetVarScaledSubstateSpeakerMean(int32 state, int32 substate,
- int32 gauss,
- const SgmmPerSpkDerivedVars &spk,
- VectorBase<Real> *mean_out) const;
-
- template<typename Real>
- void GetNtransSigmaInv(std::vector< Matrix<Real> > *out) const;
-
- /// Computes quantities H = M_i Sigma_i^{-1} M_i^T.
- template<class Real>
- void ComputeH(std::vector< SpMatrix<Real> > *H_i) const;
-
- protected:
- friend class ComputeNormalizersClass;
- private:
- /// Compute a subset of normalizers; used in multi-threaded implementation.
- void ComputeNormalizersInternal(int32 num_threads, int32 thread,
- int32 *entropy_count, double *entropy_sum);
-
-
- /// Initializes the matrices M_ and w_
- void InitializeMw(int32 phn_subspace_dim,
- const Matrix<BaseFloat> &norm_xform);
- /// Initializes the matrices N_
- void InitializeN(int32 spk_subspace_dim, const Matrix<BaseFloat> &norm_xform);
- void InitializeVecs(int32 num_states); ///< Initializes the state-vectors.
- void InitializeCovars(); ///< initializes the within-class covariances.
-
- void ComputeSmoothingTermsFromModel(
- const std::vector< SpMatrix<BaseFloat> > &H,
- const Vector<BaseFloat> &state_occupancies, SpMatrix<BaseFloat> *H_sm,
- BaseFloat max_cond) const;
-
- private:
- /// These contain the "background" model associated with the subspace GMM.
- DiagGmm diag_ubm_;
- FullGmm full_ubm_;
-
- /// Globally shared parameters of the subspace GMM.
- /// The various quantities are: I = number of Gaussians, D = data dimension,
- /// S = phonetic subspace dimension, T = speaker subspace dimension,
- /// J = number of states, M_{j} = number of substates of state j.
-
- /// Inverse within-class (full) covariances; dim is [I][D][D].
- std::vector< SpMatrix<BaseFloat> > SigmaInv_;
- /// Phonetic-subspace projections. Dimension is [I][D][S]
- std::vector< Matrix<BaseFloat> > M_;
- /// Speaker-subspace projections. Dimension is [I][D][T]
- std::vector< Matrix<BaseFloat> > N_;
- /// Weight projection vectors. Dimension is [I][S]
- Matrix<BaseFloat> w_;
-
- /// The parameters in a particular SGMM state.
-
- /// v_{jm}, per-state phonetic-subspace vectors. Dimension is [J][M_{j}][S].
- std::vector< Matrix<BaseFloat> > v_;
- /// c_{jm}, mixture weights. Dimension is [J][M_{j}]
- std::vector< Vector<BaseFloat> > c_;
- /// n_{jim}, per-Gaussian normalizer. Dimension is [J][I][M_{j}]
- std::vector< Matrix<BaseFloat> > n_;
-
- // Priors for MAP adaptation of M -- keeping them here for now but they may
- // be moved somewhere else eventually
- // These are parameters of a matrix-variate normal distribution. The means are
- // the unadapted M_i, and we have 2 separate covaraince matrices for the rows
- // and columns of M.
- std::vector< Matrix<BaseFloat> > M_prior_; // Matrix-variate Gaussian mean
- SpMatrix<BaseFloat> row_cov_inv_;
- SpMatrix<BaseFloat> col_cov_inv_;
-
- KALDI_DISALLOW_COPY_AND_ASSIGN(AmSgmm);
- friend class EbwAmSgmmUpdater;
- friend class MleAmSgmmUpdater;
- friend class MleSgmmSpeakerAccs;
- friend class AmSgmmFunctions; // misc functions that need access.
- friend class MleAmSgmmUpdaterMulti;
-};
-
-template<typename Real>
-inline void AmSgmm::GetInvCovars(int32 gauss_index,
- SpMatrix<Real> *out) const {
- out->Resize(SigmaInv_[gauss_index].NumRows(), kUndefined);
- out->CopyFromSp(SigmaInv_[gauss_index]);
-}
-
-template<typename Real>
-inline void AmSgmm::GetSubstateMean(int32 j, int32 m, int32 i,
- VectorBase<Real> *mean_out) const {
- KALDI_ASSERT(mean_out != NULL);
- KALDI_ASSERT(j < NumPdfs() && m < NumSubstates(j) && i < NumGauss());
- KALDI_ASSERT(mean_out->Dim() == FeatureDim());
- Vector<BaseFloat> mean_tmp(FeatureDim());
- mean_tmp.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0);
- mean_out->CopyFromVec(mean_tmp);
-}
-
-
-template<typename Real>
-inline void AmSgmm::GetSubstateSpeakerMean(int32 j, int32 m, int32 i,
- const SgmmPerSpkDerivedVars &spk,
- VectorBase<Real> *mean_out) const {
- GetSubstateMean(j, m, i, mean_out);
- if (spk.v_s.Dim() != 0) // have speaker adaptation...
- mean_out->AddVec(1.0, spk.o_s.Row(i));
-}
-
-template<typename Real>
-void AmSgmm::GetVarScaledSubstateSpeakerMean(int32 j, int32 m, int32 i,
- const SgmmPerSpkDerivedVars &spk,
- VectorBase<Real> *mean_out) const {
- Vector<BaseFloat> tmp_mean(mean_out->Dim()), tmp_mean2(mean_out->Dim());
- GetSubstateSpeakerMean(j, m, i, spk, &tmp_mean);
- tmp_mean2.AddSpVec(1.0, SigmaInv_[i], tmp_mean, 0.0);
- mean_out->CopyFromVec(tmp_mean2);
-}
-
-
-/// Computes the inverse of an LDA transform (without dimensionality reduction)
-/// The computed transform is used in initializing the phonetic and speaker
-/// subspaces, as well as while increasing the dimensions of those spaces.
-void ComputeFeatureNormalizer(const FullGmm &gmm, Matrix<BaseFloat> *xform);
-
-
-/// This is the entry for a single time.
-struct SgmmGauPostElement {
- // Need gselect info here, since "posteriors" is relative to this set of
- // selected Gaussians.
- std::vector<int32> gselect;
- std::vector<int32> tids; // transition-ids for each entry in "posteriors"
- std::vector<Matrix<BaseFloat> > posteriors;
-};
-
-
-/// indexed by time.
-class SgmmGauPost: public std::vector<SgmmGauPostElement> {
- public:
- // Add the standard Kaldi Read and Write routines so
- // we can use KaldiObjectHolder with this type.
- explicit SgmmGauPost(size_t i) : std::vector<SgmmGauPostElement>(i) {}
- SgmmGauPost() {}
- void Write(std::ostream &os, bool binary) const;
- void Read(std::istream &is, bool binary);
-};
-
-typedef KaldiObjectHolder<SgmmGauPost> SgmmGauPostHolder;
-typedef RandomAccessTableReader<SgmmGauPostHolder> RandomAccessSgmmGauPostReader;
-typedef SequentialTableReader<SgmmGauPostHolder> SequentialSgmmGauPostReader;
-typedef TableWriter<SgmmGauPostHolder> SgmmGauPostWriter;
-
-/// Class for misc functions that need access to SGMM private variables.
-class AmSgmmFunctions {
- public:
- /// Computes matrix of approximated K-L divergences,
- /// of size [#states x #states], as described in
- /// "State-Level Data Borrowing for Low-Resource Speech Recognition based on
- /// Subspace GMMs", by Yanmin Qian et. al, Interspeech 2011.
- /// Model must have one substate per state.
- static void ComputeDistances(const AmSgmm &model,
- const Vector<BaseFloat> &state_occs,
- MatrixBase<BaseFloat> *dists);
-};
-
-} // namespace kaldi
-
-
-#endif // KALDI_SGMM_AM_SGMM_H_
diff --git a/src/sgmm/decodable-am-sgmm.cc b/src/sgmm/decodable-am-sgmm.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// sgmm/decodable-am-sgmm.cc
-
-// Copyright 2009-2011 Saarland University; Lukas Burget
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-using std::vector;
-
-#include "sgmm/decodable-am-sgmm.h"
-
-namespace kaldi {
-
-BaseFloat DecodableAmSgmm::LogLikelihoodZeroBased(int32 frame, int32 pdf_id) {
- KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
- KALDI_ASSERT(pdf_id >= 0 && pdf_id < NumIndices());
-
- if (log_like_cache_[pdf_id].hit_time == frame) {
- return log_like_cache_[pdf_id].log_like; // return cached value, if found
- }
-
- const VectorBase<BaseFloat> &data = feature_matrix_.Row(frame);
- // check if everything is in order
- if (acoustic_model_.FeatureDim() != data.Dim()) {
- KALDI_ERR << "Dim mismatch: data dim = " << data.Dim()
- << "vs. model dim = " << acoustic_model_.FeatureDim();
- }
-
- if (frame != previous_frame_) { // Per-frame precomputation for SGMM.
- if (gselect_all_.empty())
- acoustic_model_.GaussianSelection(sgmm_config_, data, &gselect_);
- else {
- KALDI_ASSERT(frame < gselect_all_.size());
- gselect_ = gselect_all_[frame];
- }
- acoustic_model_.ComputePerFrameVars(data, gselect_, spk_,
- 0.0 /*FMLLR logdet*/, &per_frame_vars_);
- previous_frame_ = frame;
- }
-
- BaseFloat loglike = acoustic_model_.LogLikelihood(per_frame_vars_, pdf_id,
- log_prune_);
- if (KALDI_ISNAN(loglike) || KALDI_ISINF(loglike))
- KALDI_ERR << "Invalid answer (overflow or invalid variances/features?)";
- log_like_cache_[pdf_id].log_like = loglike;
- log_like_cache_[pdf_id].hit_time = frame;
- return loglike;
-}
-
-void DecodableAmSgmm::ResetLogLikeCache() {
- if (log_like_cache_.size() != acoustic_model_.NumPdfs()) {
- log_like_cache_.resize(acoustic_model_.NumPdfs());
- }
- vector<LikelihoodCacheRecord>::iterator it = log_like_cache_.begin(),
- end = log_like_cache_.end();
- for (; it != end; ++it) { it->hit_time = -1; }
-}
-
-} // namespace kaldi
diff --git a/src/sgmm/decodable-am-sgmm.h b/src/sgmm/decodable-am-sgmm.h
+++ /dev/null
@@ -1,119 +0,0 @@
-// sgmm/decodable-am-sgmm.h
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation;
-// Lukas Burget
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM_DECODABLE_AM_SGMM_H_
-#define KALDI_SGMM_DECODABLE_AM_SGMM_H_
-
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "itf/decodable-itf.h"
-
-namespace kaldi {
-
-class DecodableAmSgmm : public DecodableInterface {
- public:
- DecodableAmSgmm(const SgmmGselectConfig &opts,
- const AmSgmm &am,
- const SgmmPerSpkDerivedVars &spk, // may be empty
- const TransitionModel &tm,
- const Matrix<BaseFloat> &feats,
- const std::vector<std::vector<int32> > &gselect_all,
- BaseFloat log_prune): // gselect_all may be empty
- acoustic_model_(am), sgmm_config_(opts), spk_(spk),
- trans_model_(tm), feature_matrix_(feats),
- gselect_all_(gselect_all), previous_frame_(-1),
- log_prune_(log_prune) {
- ResetLogLikeCache();
- }
-
- // Note, frames are numbered from zero, but transition indices are 1-based!
- // This is for compatibility with OpenFST.
- virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
- return LogLikelihoodZeroBased(frame, trans_model_.TransitionIdToPdf(tid));
- }
- int32 NumFramesReady() const { return feature_matrix_.NumRows(); }
- virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
-
- virtual bool IsLastFrame(int32 frame) const {
- KALDI_ASSERT(frame < NumFramesReady());
- return (frame == NumFramesReady() - 1);
- }
-
- protected:
- void ResetLogLikeCache();
- virtual BaseFloat LogLikelihoodZeroBased(int32 frame, int32 pdf_id);
-
- const AmSgmm &acoustic_model_;
- const SgmmGselectConfig &sgmm_config_;
- const SgmmPerSpkDerivedVars &spk_;
- const TransitionModel &trans_model_; ///< for tid to pdf mapping
- const Matrix<BaseFloat> &feature_matrix_;
- const std::vector<std::vector<int32> > gselect_all_; ///< if nonempty,
- ///< precomputed gaussian indices.
- int32 previous_frame_;
- BaseFloat log_prune_;
-
- /// Defines a cache record for a state
- struct LikelihoodCacheRecord {
- BaseFloat log_like; ///< Cache value
- int32 hit_time; ///< Frame for which this value is relevant
- };
-
- /// Cached per-frame quantities used in SGMM likelihood computation.
- std::vector<LikelihoodCacheRecord> log_like_cache_;
- std::vector<int32> gselect_;
- SgmmPerFrameDerivedVars per_frame_vars_;
-
- private:
- KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmm);
-};
-
-class DecodableAmSgmmScaled : public DecodableAmSgmm {
- public:
- DecodableAmSgmmScaled(const SgmmGselectConfig &opts,
- const AmSgmm &am,
- const SgmmPerSpkDerivedVars &spk, // may be empty
- const TransitionModel &tm,
- const Matrix<BaseFloat> &feats,
- const std::vector<std::vector<int32> > &gselect_all,
- // gselect_all may be empty
- BaseFloat log_prune,
- BaseFloat scale)
- : DecodableAmSgmm(opts, am, spk, tm, feats, gselect_all, log_prune),
- scale_(scale) {}
-
- // Note, frames are numbered from zero but transition-ids from one.
- virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
- return LogLikelihoodZeroBased(frame, trans_model_.TransitionIdToPdf(tid))
- * scale_;
- }
-
- private:
- BaseFloat scale_;
- KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmmScaled);
-};
-
-
-} // namespace kaldi
-
-#endif // KALDI_SGMM_DECODABLE_AM_SGMM_H_
diff --git a/src/sgmm/estimate-am-sgmm-ebw.cc b/src/sgmm/estimate-am-sgmm-ebw.cc
+++ /dev/null
@@ -1,654 +0,0 @@
-// sgmm/estimate-am-sgmm-ebw.cc
-
-// Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "sgmm/estimate-am-sgmm-ebw.h"
-#include "thread/kaldi-thread.h"
-using std::vector;
-
-namespace kaldi {
-
-void EbwAmSgmmUpdater::Update(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- SgmmUpdateFlagsType flags,
- BaseFloat *auxf_change_out,
- BaseFloat *count_out) {
-
- KALDI_ASSERT((flags & (kSgmmPhoneVectors | kSgmmPhoneProjections |
- kSgmmPhoneWeightProjections | kSgmmCovarianceMatrix |
- kSgmmSubstateWeights | kSgmmSpeakerProjections)) != 0);
-
- // Various quantities need to be computed at the start, before we
- // change any of the model parameters.
- std::vector< SpMatrix<double> > Q_num, Q_den, H, S_means;
-
- if (flags & kSgmmPhoneProjections) {
- MleAmSgmmUpdater::ComputeQ(num_accs, *model, &Q_num);
- MleAmSgmmUpdater::ComputeQ(den_accs, *model, &Q_den);
- }
- if (flags & kSgmmCovarianceMatrix) { // compute the difference between
- // the num and den S_means matrices... this is what we will need.
- MleAmSgmmUpdater::ComputeSMeans(num_accs, *model, &S_means);
- std::vector< SpMatrix<double> > S_means_tmp;
- MleAmSgmmUpdater::ComputeSMeans(den_accs, *model, &S_means_tmp);
- for (size_t i = 0; i < S_means.size(); i++)
- S_means[i].AddSp(-1.0, S_means_tmp[i]);
- }
- if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections))
- model->ComputeH(&H);
-
- BaseFloat tot_impr = 0.0;
-
- if (flags & kSgmmPhoneVectors)
- tot_impr += UpdatePhoneVectors(num_accs, den_accs, model, H);
-
- if (flags & kSgmmPhoneProjections)
- tot_impr += UpdateM(num_accs, den_accs, Q_num, Q_den, model);
-
- if (flags & kSgmmPhoneWeightProjections)
- tot_impr += UpdateWParallel(num_accs, den_accs, model);
-
- if (flags & kSgmmCovarianceMatrix)
- tot_impr += UpdateVars(num_accs, den_accs, S_means, model);
-
- if (flags & kSgmmSubstateWeights)
- tot_impr += UpdateSubstateWeights(num_accs, den_accs, model);
-
- if (flags & kSgmmSpeakerProjections)
- tot_impr += UpdateN(num_accs, den_accs, model);
-
-
- if (auxf_change_out) *auxf_change_out = tot_impr * num_accs.total_frames_;
- if (count_out) *count_out = num_accs.total_frames_;
-
- if (fabs(num_accs.total_frames_ - den_accs.total_frames_) >
- 0.01*(num_accs.total_frames_ + den_accs.total_frames_))
- KALDI_WARN << "Num and den frame counts differ, "
- << num_accs.total_frames_ << " vs. " << den_accs.total_frames_;
-
- BaseFloat like_diff = num_accs.total_like_ - den_accs.total_like_;
-
- KALDI_LOG << "***Averaged differenced likelihood per frame is "
- << (like_diff/num_accs.total_frames_)
- << " over " << (num_accs.total_frames_) << " frames.";
- KALDI_LOG << "***Note: for this to be at all meaningful, if you use "
- << "\"canceled\" stats you will have to renormalize this over "
- << "the \"real\" frame count.";
-
- model->ComputeNormalizers();
-}
-
-
-class EbwUpdatePhoneVectorsClass: public MultiThreadable { // For multi-threaded.
- public:
- EbwUpdatePhoneVectorsClass(const EbwAmSgmmUpdater *updater,
- const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- double *auxf_impr):
- updater_(updater), num_accs_(num_accs), den_accs_(den_accs),
- model_(model), H_(H), auxf_impr_ptr_(auxf_impr), auxf_impr_(0.0) { }
-
- ~EbwUpdatePhoneVectorsClass() {
- *auxf_impr_ptr_ += auxf_impr_;
- }
-
- inline void operator() () {
- // Note: give them local copy of the sums we're computing,
- // which will be propagated to the total sums in the destructor.
- updater_->UpdatePhoneVectorsInternal(num_accs_, den_accs_, model_, H_,
- &auxf_impr_, num_threads_, thread_id_);
- }
- private:
- const EbwAmSgmmUpdater *updater_;
- const MleAmSgmmAccs &num_accs_;
- const MleAmSgmmAccs &den_accs_;
- AmSgmm *model_;
- const std::vector<SpMatrix<double> > &H_;
- double *auxf_impr_ptr_;
- double auxf_impr_;
-};
-
-
-void EbwAmSgmmUpdater::ComputePhoneVecStats(
- const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const std::vector<SpMatrix<double> > &H,
- int32 j,
- int32 m,
- const Vector<double> &w_jm,
- double gamma_jm,
- Vector<double> *g_jm,
- SpMatrix<double> *H_jm) {
- g_jm->CopyFromVec(accs.y_[j].Row(m));
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- double gamma_jmi = accs.gamma_[j](m, i);
- double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i));
- double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term
- * VecVec(model.w_.Row(i), model.v_[j].Row(m));
- g_jm->AddVec(scalar, model.w_.Row(i));
- if (gamma_jmi != 0.0)
- H_jm->AddSp(gamma_jmi, H[i]); // The most important term..
- if (quadratic_term > 1.0e-10)
- H_jm->AddVec2(static_cast<BaseFloat>(quadratic_term), model.w_.Row(i));
- }
-}
-
-
-// Runs the phone vectors update for a subset of states (called
-// multi-threaded).
-void EbwAmSgmmUpdater::UpdatePhoneVectorsInternal(
- const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- double *auxf_impr,
- int32 num_threads,
- int32 thread_id) const {
-
- int32 block_size = (num_accs.num_states_ + (num_threads-1)) / num_threads,
- j_start = block_size * thread_id,
- j_end = std::min(num_accs.num_states_, j_start + block_size);
-
- int32 S = num_accs.phn_space_dim_, I = num_accs.num_gaussians_;
-
- for (int32 j = j_start; j < j_end; j++) {
- double num_state_count = 0.0,
- state_auxf_impr = 0.0;
- Vector<double> w_jm(I);
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- double gamma_jm_num = num_accs.gamma_[j].Row(m).Sum();
- double gamma_jm_den = den_accs.gamma_[j].Row(m).Sum();
- num_state_count += gamma_jm_num;
- Vector<double> g_jm_num(S); // computed using eq. 58 of SGMM paper [for numerator stats]
- SpMatrix<double> H_jm_num(S); // computed using eq. 59 of SGMM paper [for numerator stats]
- Vector<double> g_jm_den(S); // same, but for denominator stats.
- SpMatrix<double> H_jm_den(S);
-
- // Compute the weights for this sub-state.
- // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
- w_jm.AddMatVec(1.0, Matrix<double>(model->w_), kNoTrans,
- Vector<double>(model->v_[j].Row(m)), 0.0);
- w_jm.ApplySoftMax();
-
- ComputePhoneVecStats(num_accs, *model, H, j, m, w_jm, gamma_jm_num,
- &g_jm_num, &H_jm_num);
- ComputePhoneVecStats(den_accs, *model, H, j, m, w_jm, gamma_jm_den,
- &g_jm_den, &H_jm_den);
-
- Vector<double> v_jm(model->v_[j].Row(m));
- Vector<double> local_derivative(S); // difference of derivative of numerator
- // and denominator objetive function.
- local_derivative.AddVec(1.0, g_jm_num);
- local_derivative.AddSpVec(-1.0, H_jm_num, v_jm, 1.0);
- local_derivative.AddVec(-1.0, g_jm_den);
- local_derivative.AddSpVec(-1.0 * -1.0, H_jm_den, v_jm, 1.0);
-
- SpMatrix<double> quadratic_term(H_jm_num);
- quadratic_term.AddSp(1.0, H_jm_den);
- double substate_count = 1.0e-10 + gamma_jm_num + gamma_jm_den;
- quadratic_term.Scale( (substate_count + options_.tau_v) / substate_count);
- quadratic_term.Scale(1.0 / (options_.lrate_v + 1.0e-10) );
-
- Vector<double> delta_v_jm(S);
-
- SolverOptions opts;
- opts.name = "v";
- opts.K = options_.max_cond;
- opts.eps = options_.epsilon;
-
- double auxf_impr =
- ((gamma_jm_num + gamma_jm_den == 0) ? 0.0 :
- SolveQuadraticProblem(quadratic_term, local_derivative,
- opts, &delta_v_jm));
-
- v_jm.AddVec(1.0, delta_v_jm);
- model->v_[j].Row(m).CopyFromVec(v_jm);
- state_auxf_impr += auxf_impr;
- }
-
- *auxf_impr += state_auxf_impr;
- if (j < 10 && thread_id == 0) {
- KALDI_LOG << "Objf impr for state j = " << j << " is "
- << (state_auxf_impr / (num_state_count + 1.0e-10))
- << " over " << num_state_count << " frames";
- }
- }
-}
-
-double EbwAmSgmmUpdater::UpdatePhoneVectors(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- const vector< SpMatrix<double> > &H) const {
- KALDI_LOG << "Updating phone vectors.";
-
- double count = 0.0, auxf_impr = 0.0;
-
- int32 J = num_accs.num_states_;
- for (int32 j = 0; j < J; j++) count += num_accs.gamma_[j].Sum();
-
- EbwUpdatePhoneVectorsClass c(this, num_accs, den_accs, model, H, &auxf_impr);
- RunMultiThreaded(c);
-
- auxf_impr /= count;
-
- KALDI_LOG << "**Overall auxf improvement for v is " << auxf_impr
- << " over " << count << " frames";
- return auxf_impr;
-}
-
-
-double EbwAmSgmmUpdater::UpdateM(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- const std::vector< SpMatrix<double> > &Q_num,
- const std::vector< SpMatrix<double> > &Q_den,
- AmSgmm *model) const {
- int32 S = model->PhoneSpaceDim(),
- D = model->FeatureDim(),
- I = model->NumGauss();
-
- Vector<double> num_count_vec(I), den_count_vec(I), impr_vec(I);
- for (int32 j = 0; j < num_accs.num_states_; j++) {
- num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]);
- den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]);
- }
-
- for (int32 i = 0; i < I; i++) {
- double gamma_i_num = num_count_vec(i), gamma_i_den = den_count_vec(i);
-
- if (gamma_i_num + gamma_i_den == 0.0) {
- KALDI_WARN << "Not updating phonetic basis for i = " << i
- << " because count is zero. ";
- continue;
- }
-
- Matrix<double> Mi(model->M_[i]);
- Matrix<double> L(D, S); // this is something like the Y quantity, which
- // represents the linear term in the objf on M-- except that we make it the local
- // derivative about the current value, instead of the derivative around zero.
- // But it's not exactly the derivative w.r.t. M, due to the factor of Sigma_i.
- // The auxiliary function is Q(x) = tr(M^T P Y) - 0.5 tr(P M Q M^T),
- // where P is Y^{-1}. The quantity L we define here will be Y - M Q,
- // and you can think of this as like the local derivative, except there is
- // a term P in there.
- L.AddMat(1.0, num_accs.Y_[i]);
- L.AddMatSp(-1.0, Mi, kNoTrans, Q_num[i], 1.0);
- L.AddMat(-1.0, den_accs.Y_[i]);
- L.AddMatSp(-1.0*-1.0, Mi, kNoTrans, Q_den[i], 1.0);
-
- SpMatrix<double> Q(S); // This is a combination of the Q's for the numerator and denominator.
- Q.AddSp(1.0, Q_num[i]);
- Q.AddSp(1.0, Q_den[i]);
-
- double state_count = 1.0e-10 + gamma_i_num + gamma_i_den; // the count
- // represented by the quadratic part of the stats.
- Q.Scale( (state_count + options_.tau_M) / state_count );
- Q.Scale( 1.0 / (options_.lrate_M + 1.0e-10) );
-
- SolverOptions opts;
- opts.name = "M";
- opts.K = options_.max_cond;
- opts.eps = options_.epsilon;
-
- Matrix<double> deltaM(D, S);
- double impr =
- SolveQuadraticMatrixProblem(Q, L,
- SpMatrix<double>(model->SigmaInv_[i]),
- opts, &deltaM);
-
- impr_vec(i) = impr;
- Mi.AddMat(1.0, deltaM);
- model->M_[i].CopyFromMat(Mi);
- if (i < 10 || impr / state_count > 3.0) {
- KALDI_LOG << "Objf impr for projection M for i = " << i << ", is "
- << (impr/(gamma_i_num + 1.0e-20)) << " over " << gamma_i_num
- << " frames";
- }
- }
- BaseFloat tot_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum();
-
- tot_impr /= (tot_count + 1.0e-20);
- KALDI_LOG << "Overall auxiliary function improvement for model projections "
- << "M is " << tot_impr << " over " << tot_count << " frames";
-
- KALDI_VLOG(1) << "Updating M: num-count is " << num_count_vec;
- KALDI_VLOG(1) << "Updating M: den-count is " << den_count_vec;
- KALDI_VLOG(1) << "Updating M: objf-impr is " << impr_vec;
-
- return tot_impr;
-}
-
-
-// Note: we do just one iteration of the weight-projection update here. The
-// weak-sense auxiliary functions used don't really make sense if we do it for
-// multiple iterations. It would be possible to use a similar auxiliary
-// function to the one on my (D. Povey)'s thesis for the Gaussian mixture
-// weights, which would make sense for multiple iterations, but this would be a
-// bit more complex to implement and probably would not give much improvement
-// over this approach.
-double EbwAmSgmmUpdater::UpdateWParallel(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model) {
- KALDI_LOG << "Updating weight projections";
-
- int32 I = num_accs.num_gaussians_, S = num_accs.phn_space_dim_;
-
- Matrix<double> g_i_num(I, S), g_i_den(I, S);
-
- // View F_i_{num,den} as vectors of SpMatrix [i.e. symmetric matrices,
- // linearized into vectors]
- Matrix<double> F_i_num(I, (S*(S+1))/2), F_i_den(I, (S*(S+1))/2);
-
- Vector<double> num_count_vec(I), den_count_vec(I), impr_vec(I);
- for (int32 j = 0; j < num_accs.num_states_; j++) {
- num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]);
- den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]);
- }
-
- // Get the F_i and g_i quantities-- this is done in parallel (multi-core),
- // using the same code we use in the ML update [except we get it for
- // numerator and denominator separately.]
- Matrix<double> w(model->w_);
- {
- double garbage;
- UpdateWParallelClass c_num(num_accs, *model, w, &F_i_num, &g_i_num, &garbage);
- RunMultiThreaded(c_num);
- }
- {
- double garbage;
- UpdateWParallelClass c_den(den_accs, *model, w, &F_i_den, &g_i_den, &garbage);
- RunMultiThreaded(c_den);
- }
-
- for (int32 i = 0; i < I; i++) {
-
- // auxf was originally formulated in terms of the change in w (i.e. the
- // g quantities are the local derivatives), so there is less hassle than
- // with some of the other updates, in changing it to be discriminative.
- // we essentially just difference the linear terms and add the quadratic
- // terms.
-
- Vector<double> derivative(g_i_num.Row(i));
- derivative.AddVec(-1.0, g_i_den.Row(i));
- // F_i_num quadratic_term is a bit like the negated 2nd derivative
- // of the numerator stats-- actually it's not the actual 2nd deriv,
- // but an upper bound on it.
- SpMatrix<double> quadratic_term(S), tmp_F(S);
- quadratic_term.CopyFromVec(F_i_num.Row(i));
- tmp_F.CopyFromVec(F_i_den.Row(i)); // tmp_F is used for Vector->SpMatrix conversion.
- quadratic_term.AddSp(1.0, tmp_F);
-
- double state_count = num_count_vec(i) + den_count_vec(i);
-
- quadratic_term.Scale((state_count + options_.tau_w) / (state_count + 1.0e-10));
- quadratic_term.Scale(1.0 / (options_.lrate_w + 1.0e-10) );
-
- Vector<double> delta_w(S);
-
- SolverOptions opts;
- opts.name = "w";
- opts.K = options_.max_cond;
- opts.eps = options_.epsilon;
-
- double objf_impr =
- SolveQuadraticProblem(quadratic_term, derivative, opts, &delta_w);
-
- impr_vec(i) = objf_impr;
- if (i < 10 || objf_impr / (num_count_vec(i) + 1.0e-10) > 2.0) {
- KALDI_LOG << "Predicted objf impr for w per frame is "
- << (objf_impr / (num_count_vec(i) + 1.0e-10))
- << " over " << num_count_vec(i) << " frames.";
- }
- model->w_.Row(i).AddVec(1.0, delta_w);
- }
- KALDI_VLOG(1) << "Updating w: numerator count is " << num_count_vec;
- KALDI_VLOG(1) << "Updating w: denominator count is " << den_count_vec;
- KALDI_VLOG(1) << "Updating w: objf-impr is " << impr_vec;
-
- double tot_num_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum();
- tot_impr /= tot_num_count;
-
- KALDI_LOG << "**Overall objf impr for w per frame is "
- << tot_impr << " over " << tot_num_count
- << " frames.";
- return tot_impr;
-}
-
-
-double EbwAmSgmmUpdater::UpdateN(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model) const {
- if (num_accs.spk_space_dim_ == 0 || num_accs.R_.size() == 0 ||
- num_accs.Z_.size() == 0) {
- KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated";
- }
-
- int32 I = num_accs.num_gaussians_, D = num_accs.feature_dim_,
- T = num_accs.spk_space_dim_;
-
- Vector<double> num_count_vec(I), den_count_vec(I), impr_vec(I);
- for (int32 j = 0; j < num_accs.num_states_; j++) {
- num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]);
- den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]);
- }
-
- for (int32 i = 0; i < I; i++) {
- double gamma_i_num = num_count_vec(i), gamma_i_den = den_count_vec(i);
- if (gamma_i_num + gamma_i_den == 0.0) {
- KALDI_WARN << "Not updating speaker basis for i = " << i
- << " because count is zero. ";
- continue;
- }
- Matrix<double> Ni(model->N_[i]);
- // See comment near declaration of L in UpdateM(). This update is the
- // same, but change M->N, Y->Z and Q->R.
-
- Matrix<double> L(D, T);
- L.AddMat(1.0, num_accs.Z_[i]);
- L.AddMatSp(-1.0, Ni, kNoTrans, num_accs.R_[i], 1.0);
- L.AddMat(-1.0, den_accs.Z_[i]);
- L.AddMatSp(-1.0*-1.0, Ni, kNoTrans, den_accs.R_[i], 1.0);
-
- SpMatrix<double> R(T); // combination of the numerator and denominator R's.
- R.AddSp(1.0, num_accs.R_[i]);
- R.AddSp(1.0, den_accs.R_[i]);
-
- double state_count = 1.0e-10 + gamma_i_num + gamma_i_den; // the count
- // represented by the quadratic part of the stats.
- R.Scale( (state_count + options_.tau_N) / state_count );
- R.Scale( 1.0 / (options_.lrate_N + 1.0e-10) );
-
- Matrix<double> deltaN(D, T);
-
- SolverOptions opts;
- opts.name = "M";
- opts.K = options_.max_cond;
- opts.eps = options_.epsilon;
-
- double impr =
- SolveQuadraticMatrixProblem(R, L,
- SpMatrix<double>(model->SigmaInv_[i]),
- opts, &deltaN);
-
- impr_vec(i) = impr;
- Ni.AddMat(1.0, deltaN);
- model->N_[i].CopyFromMat(Ni);
- if (i < 10 || impr / (state_count+1.0e-20) > 3.0) {
- KALDI_LOG << "Objf impr for spk projection N for i = " << (i)
- << ", is " << (impr / (gamma_i_num + 1.0e-20)) << " over "
- << gamma_i_num << " frames";
- }
- }
-
- KALDI_VLOG(1) << "Updating N: numerator count is " << num_count_vec;
- KALDI_VLOG(1) << "Updating N: denominator count is " << den_count_vec;
- KALDI_VLOG(1) << "Updating N: objf-impr is " << impr_vec;
-
- double tot_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum();
- tot_impr /= (tot_count + 1.0e-20);
- KALDI_LOG << "**Overall auxf impr for N is " << tot_impr
- << " over " << tot_count << " frames";
- return tot_impr;
-}
-
-double EbwAmSgmmUpdater::UpdateVars(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- const std::vector< SpMatrix<double> > &S_means,
- AmSgmm *model) const {
- // Note: S_means contains not only the quantity S_means in the paper,
- // but also has a term - (Y_i M_i^T + M_i Y_i^T). Plus, it is differenced
- // between numerator and denominator. We don't calculate it here,
- // because it had to be computed with the original model, before we
- // changed the M quantities.
- int32 I = num_accs.num_gaussians_;
- KALDI_ASSERT(S_means.size() == I);
-
- Vector<double> num_count_vec(I), den_count_vec(I), impr_vec(I);
- for (int32 j = 0; j < num_accs.num_states_; j++) {
- num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]);
- den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]);
- }
-
- for (int32 i = 0; i < I; i++) {
- double num_count = num_count_vec(i), den_count = den_count_vec(i);
-
- SpMatrix<double> SigmaStats(S_means[i]);
- SigmaStats.AddSp(1.0, num_accs.S_[i]);
- SigmaStats.AddSp(-1.0, den_accs.S_[i]);
- // SigmaStats now contain the stats for estimating Sigma (as in the main SGMM paper),
- // differenced between num and den.
- SpMatrix<double> SigmaInvOld(model->SigmaInv_[i]), SigmaOld(model->SigmaInv_[i]);
- SigmaOld.Invert();
- double count = num_count - den_count;
- KALDI_ASSERT(options_.lrate_Sigma <= 1.0);
- double inv_lrate = 1.0 / options_.lrate_Sigma;
- // These formulas assure that the objective function behaves in
- // a roughly symmetric way w.r.t. num and den counts.
- double E_den = 1.0 + inv_lrate, E_num = inv_lrate - 1.0;
-
- double smoothing_count =
- (options_.tau_Sigma * inv_lrate) + // multiply tau_Sigma by inverse-lrate
- (E_den * den_count) + // for compatibility with other updates.
- (E_num * num_count) +
- 1.0e-10;
- SigmaStats.AddSp(smoothing_count, SigmaOld);
- count += smoothing_count;
- SigmaStats.Scale(1.0 / count);
- SpMatrix<double> SigmaInv(SigmaStats); // before floor and ceiling. Currently sigma,
- // not its inverse.
- bool verbose = false;
- int n_floor = SigmaInv.ApplyFloor(SigmaOld, options_.cov_min_value, verbose);
- SigmaInv.Invert(); // make it inverse variance.
- int n_ceiling = SigmaInv.ApplyFloor(SigmaInvOld, options_.cov_min_value, verbose);
-
- // this auxf_change.
- double auxf_change = -0.5 * count *(TraceSpSp(SigmaInv, SigmaStats)
- - TraceSpSp(SigmaInvOld, SigmaStats)
- - SigmaInv.LogDet()
- + SigmaInvOld.LogDet());
-
- model->SigmaInv_[i].CopyFromSp(SigmaInv);
- impr_vec(i) = auxf_change;
- if (i < 10 || auxf_change / (num_count+den_count+1.0e-10) > 2.0
- || n_floor+n_ceiling > 0) {
- KALDI_LOG << "Updating variance: Auxf change per frame for Gaussian "
- << i << " is " << (auxf_change / num_count) << " over "
- << num_count << " frames " << "(den count was " << den_count
- << "), #floor,ceil was " << n_floor << ", " << n_ceiling;
- }
- }
- KALDI_VLOG(1) << "Updating Sigma: numerator count is " << num_count_vec;
- KALDI_VLOG(1) << "Updating Sigma: denominator count is " << den_count_vec;
- KALDI_VLOG(1) << "Updating Sigma: objf-impr is " << impr_vec;
-
- double tot_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum();
- tot_impr /= tot_count+1.0e-20;
- KALDI_LOG << "**Overall auxf impr for Sigma is " << tot_impr
- << " over " << tot_count << " frames";
- return tot_impr;
-}
-
-
-double EbwAmSgmmUpdater::UpdateSubstateWeights(
- const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model) {
- KALDI_LOG << "Updating substate mixture weights";
-
- double tot_count = 0.0, tot_impr = 0.0;
- for (int32 j = 0; j < num_accs.num_states_; j++) {
- int32 M = model->NumSubstates(j);
- Vector<double> num_occs(M), den_occs(M),
- orig_weights(model->c_[j]), weights(model->c_[j]);
-
- for (int32 m = 0; m < M; m++) {
- num_occs(m) = num_accs.gamma_[j].Row(m).Sum()
- + options_.tau_c * weights(m);
- den_occs(m) = den_accs.gamma_[j].Row(m).Sum();
- }
-
- if (weights.Dim() > 1) {
- double begin_auxf = 0.0, end_auxf = 0.0;
- for (int32 m = 0; m < M; m++) { // see eq. 4.32, Dan Povey's PhD thesis.
- begin_auxf += num_occs(m) * log (weights(m))
- - den_occs(m) * weights(m) / orig_weights(m);
- }
- for (int32 iter = 0; iter < 50; iter++) {
- Vector<double> k_jm(M);
- double max_m = 0.0;
- for (int32 m = 0; m < M; m++)
- max_m = std::max(max_m, den_occs(m)/orig_weights(m));
- for (int32 m = 0; m < M; m++)
- k_jm(m) = max_m - den_occs(m)/orig_weights(m);
- for (int32 m = 0; m < M; m++)
- weights(m) = num_occs(m) + k_jm(m)*weights(m);
- weights.Scale(1.0 / weights.Sum());
- }
- for (int32 m = 0; m < M; m++)
- weights(m) = std::max(weights(m),
- static_cast<double>(options_.min_substate_weight));
- weights.Scale(1.0 / weights.Sum()); // renormalize.
-
- for (int32 m = 0; m < M; m++) {
- end_auxf += num_occs(m) * log (weights(m))
- - den_occs(m) * weights(m) / orig_weights(m);
- }
- tot_impr += end_auxf - begin_auxf;
- double this_impr = ((end_auxf - begin_auxf) / num_occs.Sum());
- if (j < 10 || this_impr > 0.5) {
- KALDI_LOG << "Updating substate weights: auxf impr for state " << j
- << " is " << this_impr << " per frame over " << num_occs.Sum()
- << " frames (den count is " << den_occs.Sum() << ")";
- }
- }
- model->c_[j].CopyFromVec(weights);
- tot_count += den_occs.Sum(); // Note: num and den occs should be the
- // same, except num occs are smoothed, so this is what we want.
- }
-
- tot_impr /= (tot_count + 1.0e-20);
-
- KALDI_LOG << "**Overall auxf impr for c is " << tot_impr
- << " over " << tot_count << " frames";
- return tot_impr;
-}
-
-} // namespace kaldi
diff --git a/src/sgmm/estimate-am-sgmm-ebw.h b/src/sgmm/estimate-am-sgmm-ebw.h
+++ /dev/null
@@ -1,217 +0,0 @@
-// sgmm/estimate-am-sgmm-ebw.h
-
-// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM_ESTIMATE_AM_SGMM_EBW_H_
-#define KALDI_SGMM_ESTIMATE_AM_SGMM_EBW_H_ 1
-
-#include <string>
-#include <vector>
-
-#include "gmm/model-common.h"
-#include "itf/options-itf.h"
-#include "sgmm/estimate-am-sgmm.h"
-
-namespace kaldi {
-
-/**
- This header implements a form of Extended Baum-Welch training for SGMMs.
- If you are confused by this comment, see Dan Povey's thesis for an explanation of
- Extended Baum-Welch.
- A note on the EBW (Extended Baum-Welch) updates for the SGMMs... In general there is
- a parameter-specific value D that is similar to the D in EBW for GMMs. The value of
- D is generally set to:
- E * (denominator-count for that parameter) + tau-value for that parameter
- where the tau-values are user-specified parameters that are specific to the type of
- the parameter (e.g. phonetic vector, subspace projection, etc.). Things are a bit
- more complex for this update than for GMMs, because it's not just a question of picking
- a tau-value for smoothing: there is sometimes a scatter-matrix of some kind (e.g.
- an outer product of vectors, or something) that defines a quadratic objective function
- that we'll add as smoothing. We have to pick where to get this scatter-matrix from.
- We feel that it's appropriate for the "E" part of the D to get its scatter-matrix from
- denominator stats, and the tau part of the D to get half its scatter-matrix from the
- both the numerator and denominator stats, assigned a weight proportional to how much
- stats there were. When you see the auxiliary function written out, it's clear why this
- makes sense.
-
- */
-
-struct EbwAmSgmmOptions {
- BaseFloat tau_v; ///< Smoothing constant for updates of sub-state vectors v_{jm}
- BaseFloat lrate_v; ///< Learning rate used in updating v-- default 0.5
- BaseFloat tau_M; ///< Smoothing constant for the M quantities (phone-subspace projections)
- BaseFloat lrate_M; ///< Learning rate used in updating M-- default 0.5
- BaseFloat tau_N; ///< Smoothing constant for the N quantities (speaker-subspace projections)
- BaseFloat lrate_N; ///< Learning rate used in updating N-- default 0.5
- BaseFloat tau_c; ///< Tau value for smoothing substate weights (c)
- BaseFloat tau_w; ///< Tau value for smoothing update of weight projectsions (w)
- BaseFloat lrate_w; ///< Learning rate used in updating w-- default 0.5
- BaseFloat tau_Sigma; ///< Tau value for smoothing covariance-matrices Sigma.
- BaseFloat lrate_Sigma; ///< Learning rate used in updating Sigma-- default 0.5
- BaseFloat min_substate_weight; ///< Minimum allowed weight in a sub-state.
-
- BaseFloat cov_min_value; ///< E.g. 0.5-- the maximum any eigenvalue of a covariance
- /// is allowed to change. [this is the minimum; the maximum is the inverse of this,
- /// i.e. 2.0 in this case. For example, 0.9 would constrain the covariance quite tightly,
- /// 0.1 would be a loose setting.
-
- BaseFloat max_cond; ///< large value used in SolveQuadraticProblem.
- BaseFloat epsilon; ///< very small value used in SolveQuadraticProblem; workaround
- /// for an issue in some implementations of SVD.
-
- EbwAmSgmmOptions() {
- tau_v = 50.0;
- lrate_v = 0.5;
- tau_M = 500.0;
- lrate_M = 0.5;
- tau_N = 500.0;
- lrate_N = 0.5;
- tau_c = 10.0;
- tau_w = 50.0;
- lrate_w = 1.0;
- tau_Sigma = 500.0;
- lrate_Sigma = 0.5;
-
- min_substate_weight = 1.0e-05;
- cov_min_value = 0.5;
-
- max_cond = 1.0e+05;
- epsilon = 1.0e-40;
- }
-
- void Register(OptionsItf *opts) {
- std::string module = "EbwAmSgmmOptions: ";
- opts->Register("tau-v", &tau_v, module+
- "Smoothing constant for phone vector estimation.");
- opts->Register("lrate-v", &lrate_v, module+
- "Learning rate constant for phone vector estimation.");
- opts->Register("tau-m", &tau_M, module+
- "Smoothing constant for estimation of phonetic-subspace projections (M).");
- opts->Register("lrate-m", &lrate_M, module+
- "Learning rate constant for phonetic-subspace projections.");
- opts->Register("tau-n", &tau_N, module+
- "Smoothing constant for estimation of speaker-subspace projections (N).");
- opts->Register("lrate-n", &lrate_N, module+
- "Learning rate constant for speaker-subspace projections.");
- opts->Register("tau-c", &tau_c, module+
- "Smoothing constant for estimation of substate weights (c)");
- opts->Register("tau-w", &tau_w, module+
- "Smoothing constant for estimation of weight projections (w)");
- opts->Register("lrate-w", &lrate_w, module+
- "Learning rate constant for weight-projections");
- opts->Register("tau-sigma", &tau_Sigma, module+
- "Smoothing constant for estimation of within-class covariances (Sigma)");
- opts->Register("lrate-sigma", &lrate_Sigma, module+
- "Constant that controls speed of learning for variances (larger->slower)");
- opts->Register("cov-min-value", &cov_min_value, module+
- "Minimum value that an eigenvalue of the updated covariance matrix can take, "
- "relative to its old value (maximum is inverse of this.)");
- opts->Register("min-substate-weight", &min_substate_weight, module+
- "Floor for weights of sub-states.");
- opts->Register("max-cond", &max_cond, module+
- "Value used in handling singular matrices during update.");
- opts->Register("epsilon", &max_cond, module+
- "Value used in handling singular matrices during update.");
- }
-};
-
-
-/** \class EbwAmSgmmUpdater
- * Contains the functions needed to update the SGMM parameters.
- */
-class EbwAmSgmmUpdater {
- public:
- explicit EbwAmSgmmUpdater(const EbwAmSgmmOptions &options):
- options_(options) {}
-
- void Update(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- SgmmUpdateFlagsType flags,
- BaseFloat *auxf_change_out,
- BaseFloat *count_out);
-
- protected:
- // The following two classes relate to multi-core parallelization of some
- // phases of the update.
- friend class EbwUpdateWParallelClass;
- friend class EbwUpdatePhoneVectorsClass;
- private:
- EbwAmSgmmOptions options_;
-
- Vector<double> gamma_j_; ///< State occupancies
-
- double UpdatePhoneVectors(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- const std::vector< SpMatrix<double> > &H) const;
-
- // Called from UpdatePhoneVectors; updates a subset of states
- // (relates to multi-threading).
- void UpdatePhoneVectorsInternal(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- double *auxf_impr,
- int32 num_threads,
- int32 thread_id) const;
- // Called from UpdatePhoneVectorsInternal
- static void ComputePhoneVecStats(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const std::vector<SpMatrix<double> > &H,
- int32 j,
- int32 m,
- const Vector<double> &w_jm,
- double gamma_jm,
- Vector<double> *g_jm,
- SpMatrix<double> *H_jm);
-
- double UpdateM(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- const std::vector< SpMatrix<double> > &Q_num,
- const std::vector< SpMatrix<double> > &Q_den,
- AmSgmm *model) const;
-
- double UpdateN(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model) const;
-
- double UpdateVars(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- const std::vector< SpMatrix<double> > &S_means,
- AmSgmm *model) const;
-
- /// Note: in the discriminative case we do just one iteration of
- /// updating the w quantities.
- double UpdateWParallel(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model);
-
- double UpdateSubstateWeights(const MleAmSgmmAccs &num_accs,
- const MleAmSgmmAccs &den_accs,
- AmSgmm *model);
-
- KALDI_DISALLOW_COPY_AND_ASSIGN(EbwAmSgmmUpdater);
- EbwAmSgmmUpdater() {} // Prevent unconfigured updater.
-};
-
-
-} // namespace kaldi
-
-
-#endif // KALDI_SGMM_ESTIMATE_AM_SGMM_EBW_H_
diff --git a/src/sgmm/estimate-am-sgmm-multi-test.cc b/src/sgmm/estimate-am-sgmm-multi-test.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-// sgmm/estimate-am-sgmm-multi-test.cc
-
-// Copyright 2009-2012 Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmm/model-test-common.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "sgmm/estimate-am-sgmm-multi.h"
-#include "util/kaldi-io.h"
-#include "base/kaldi-math.h"
-
-using kaldi::AmSgmm;
-using kaldi::MleAmSgmmAccs;
-using kaldi::BaseFloat;
-using kaldi::Exp;
-
-namespace ut = kaldi::unittest;
-
-// Tests the MleAmSgmmUpdaterMulti (and MleAmSgmmGlobalAccs) classes.
-void TestMultiSgmmEst(const std::vector<AmSgmm*> &models,
- const std::vector< kaldi::Matrix<BaseFloat> > &feats,
- kaldi::SgmmUpdateFlagsType flags) {
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- int32 num_gauss = models[0]->NumGauss(),
- feat_dim = models[0]->FeatureDim(),
- phn_dim = models[0]->PhoneSpaceDim(),
- spk_dim = models[0]->SpkSpaceDim(),
- num_models = models.size();
- SgmmPerFrameDerivedVars frame_vars;
- SgmmPerSpkDerivedVars spk_vars;
- spk_vars.v_s.Resize(spk_dim);
- spk_vars.v_s.SetRandn();
- SgmmGselectConfig sgmm_config;
- frame_vars.Resize(num_gauss, feat_dim, phn_dim);
- sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest, num_gauss);
-
- std::vector<MleAmSgmmAccs*> accs(num_models);
- BaseFloat loglike = 0.0;
- for (int32 i = 0; i < num_models; ++i) {
- MleAmSgmmAccs* acc = new MleAmSgmmAccs(*models[i], flags);
- models[i]->ComputePerSpkDerivedVars(&spk_vars);
- for (int32 f = 0; f < feats[i].NumRows(); ++f) {
- std::vector<int32> gselect;
- models[i]->GaussianSelection(sgmm_config, feats[i].Row(f), &gselect);
- models[i]->ComputePerFrameVars(feats[i].Row(f), gselect, spk_vars, 0.0,
- &frame_vars);
- loglike += acc->Accumulate(*models[i], frame_vars, spk_vars.v_s, 0, 1.0,
- flags);
- }
- acc->CommitStatsForSpk(*models[i], spk_vars.v_s);
- accs[i] = acc;
- }
-
- std::vector<AmSgmm*> new_models(num_models);
- kaldi::MleAmSgmmOptions update_opts;
- for (int32 i = 0; i < num_models; ++i) {
- AmSgmm *sgmm1 = new AmSgmm();
- sgmm1->CopyFromSgmm(*models[i], false);
- new_models[i] = sgmm1;
- }
-
- // Updater class stores globals parameters; OK to initialize with any model
- // since it is assumed that they have the same global parameters.
- kaldi::MleAmSgmmUpdaterMulti updater(*models[0], update_opts);
- updater.Update(accs, new_models, flags);
-
- BaseFloat loglike1 = 0.0;
- for (int32 i = 0; i < num_models; ++i) {
- new_models[i]->ComputePerSpkDerivedVars(&spk_vars);
- for (int32 f = 0; f < feats[i].NumRows(); ++f) {
- std::vector<int32> gselect;
- new_models[i]->GaussianSelection(sgmm_config, feats[i].Row(f), &gselect);
- new_models[i]->ComputePerFrameVars(feats[i].Row(f), gselect, spk_vars, 0.0,
- &frame_vars);
- loglike1 += new_models[i]->LogLikelihood(frame_vars, 0);
- }
- }
- KALDI_LOG << "LL = " << loglike << "; LL1 = " << loglike1;
-
- KALDI_ASSERT(loglike1 >= loglike - (std::abs(loglike1)+std::abs(loglike))*1.0e-06);
-
- DeletePointers(&accs);
- DeletePointers(&new_models);
-}
-
-void UnitTestEstimateSgmm() {
- int32 dim = 2 + kaldi::RandInt(0, 9); // random dimension of the gmm
- int32 num_comp = 2 + kaldi::RandInt(0, 9); // random mixture size
- kaldi::FullGmm full_gmm;
- ut::InitRandFullGmm(dim, num_comp, &full_gmm);
-
- int32 num_states = 1;
- int32 num_models = kaldi::RandInt(2, 9);
- std::vector<AmSgmm*> models(num_models);
- for (int32 i =0; i < num_models; ++i) {
- AmSgmm* sgmm = new AmSgmm();
- sgmm->InitializeFromFullGmm(full_gmm, num_states, dim+1, dim);
- sgmm->ComputeNormalizers();
- models[i] = sgmm;
- }
-
- std::vector< kaldi::Matrix<BaseFloat> > feats(num_models);
- for (int32 i = 0; i < num_models; ++i) {
- // First, generate random means and variances
- int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2);
- kaldi::Matrix<BaseFloat> means(num_feat_comp, dim),
- vars(num_feat_comp, dim);
- for (int32 m = 0; m < num_feat_comp; ++m) {
- for (int32 d= 0; d < dim; d++) {
- means(m, d) = kaldi::RandGauss();
- vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2;
- }
- }
- // Now generate random features with those means and variances.
- feats[i].Resize(num_feat_comp * 200, dim);
- for (int32 m = 0; m < num_feat_comp; ++m) {
- kaldi::SubMatrix<BaseFloat> tmp(feats[i], m*200, 200, 0, dim);
- ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp);
- }
- }
- kaldi::SgmmUpdateFlagsType flags = kaldi::kSgmmAll;
- TestMultiSgmmEst(models, feats, flags);
- flags = (kaldi::kSgmmPhoneProjections | kaldi::kSgmmPhoneWeightProjections |
- kaldi::kSgmmCovarianceMatrix);
- TestMultiSgmmEst(models, feats, flags);
- flags = (kaldi::kSgmmSpeakerProjections | kaldi::kSgmmCovarianceMatrix |
- kaldi::kSgmmPhoneVectors);
- TestMultiSgmmEst(models, feats, flags);
- kaldi::DeletePointers(&models);
-}
-
-int main() {
- for (int i = 0; i < 10; ++i)
- UnitTestEstimateSgmm();
- std::cout << "Test OK.\n";
- return 0;
-}
diff --git a/src/sgmm/estimate-am-sgmm-multi.cc b/src/sgmm/estimate-am-sgmm-multi.cc
+++ /dev/null
@@ -1,746 +0,0 @@
-// sgmm/estimate-am-sgmm-multi.cc
-
-// Copyright 2012 Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm-multi.h"
-#include "thread/kaldi-thread.h"
-
-namespace kaldi {
-
-void MleAmSgmmGlobalAccs::ResizeAccumulators(const AmSgmm &model,
- SgmmUpdateFlagsType flags) {
- num_gaussians_ = model.NumGauss();
- feature_dim_ = model.FeatureDim();
- phn_space_dim_ = model.PhoneSpaceDim();
- spk_space_dim_ = model.SpkSpaceDim();
-
- if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) {
- Y_.resize(num_gaussians_);
- Q_.resize(num_gaussians_);
- for (int32 i = 0; i < num_gaussians_; ++i) {
- Y_[i].Resize(feature_dim_, phn_space_dim_, kSetZero);
- Q_[i].Resize(phn_space_dim_, kSetZero);
- }
- } else {
- Y_.clear();
- Q_.clear();
- }
-
- if (flags & kSgmmCovarianceMatrix) {
- S_.resize(num_gaussians_);
- S_means_.resize(num_gaussians_);
- for (int32 i = 0; i < num_gaussians_; i++) {
- S_[i].Resize(feature_dim_, kSetZero);
- S_means_[i].Resize(feature_dim_, kSetZero);
- }
- } else {
- S_.clear();
- }
-
- if (flags & kSgmmSpeakerProjections) {
- if (spk_space_dim_ == 0) {
- KALDI_ERR << "Cannot set up accumulators for speaker projections "
- << "because speaker subspace has not been set up";
- }
- Z_.resize(num_gaussians_);
- R_.resize(num_gaussians_);
- for (int32 i = 0; i < num_gaussians_; ++i) {
- Z_[i].Resize(feature_dim_, spk_space_dim_, kSetZero);
- R_[i].Resize(spk_space_dim_, kSetZero);
- }
- } else {
- Z_.clear();
- R_.clear();
- }
-
- gamma_i_.Resize(num_gaussians_, kSetZero);
-}
-
-void MleAmSgmmGlobalAccs::ZeroAccumulators(SgmmUpdateFlagsType flags) {
- if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) {
- for (int32 i = 0, end = Y_.size(); i < end; ++i)
- Y_[i].SetZero();
- }
- if (flags & kSgmmCovarianceMatrix) {
- for (int32 i = 0, end = S_.size(); i < end; ++i) {
- S_[i].SetZero();
- S_means_[i].SetZero();
- }
- }
-
- if (flags & kSgmmSpeakerProjections) {
- for (int32 i = 0, end = Z_.size(); i < end; ++i) {
- Z_[i].SetZero();
- R_[i].SetZero();
- }
- }
- gamma_i_.SetZero();
-}
-
-void MleAmSgmmGlobalAccs::AddAccumulators(const AmSgmm &model,
- const MleAmSgmmAccs &accs,
- SgmmUpdateFlagsType flags) {
- total_frames_ += accs.total_frames_;
- total_like_ += accs.total_like_;
- for (int32 i = 0; i < num_gaussians_; ++i) {
- if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) {
- Y_[i].AddMat(1.0, accs.Y_[i], kNoTrans);
- }
- if (flags & kSgmmSpeakerProjections) {
- Z_[i].AddMat(1.0, accs.Z_[i], kNoTrans);
- R_[i].AddSp(1.0, accs.R_[i]);
- }
- if (flags & kSgmmCovarianceMatrix)
- S_[i].AddSp(1.0, accs.S_[i]);
- }
-
- // gamma_i
- for (int32 j = 0; j < model.NumPdfs(); ++j) {
- for (int32 m = 0; m < model.NumSubstates(j); ++m) {
- gamma_i_.AddVec(1.0, accs.gamma_[j].Row(m));
- }
- }
-
- // Compute the Q_i quantities (Eq. 64).
- if (flags & kSgmmPhoneProjections) {
- for (int32 i = 0; i < num_gaussians_; ++i) {
- for (int32 j = 0; j < accs.num_states_; ++j) {
- const Matrix<BaseFloat> &state_vec(model.StateVectors(j));
- for (int32 m = 0; m < model.NumSubstates(j); ++m) {
- if (accs.gamma_[j](m, i) > 0.0) {
- Q_[i].AddVec2(static_cast<BaseFloat>(accs.gamma_[j](m, i)),
- state_vec.Row(m));
- }
- }
- }
- }
- }
-
- // Compute the S_i^{(means)} quantities (Eq. 74).
- if (flags & kSgmmCovarianceMatrix) {
- Matrix<double> YM_MY(feature_dim_, feature_dim_);
- SpMatrix<double> tmp_S_means(feature_dim_);
- Vector<BaseFloat> mu_jmi(feature_dim_);
- for (int32 i = 0; i < num_gaussians_; ++i) {
- // YM_MY = - (Y_{i} M_{i}^T)
- Matrix<double> M(model.GetPhoneProjection(i));
- YM_MY.AddMatMat(-1.0, accs.Y_[i], kNoTrans, M, kTrans, 0.0);
- // Add its own transpose: YM_MY = - (Y_{i} M_{i}^T + M_{i} Y_{i}^T)
- {
- Matrix<double> M(YM_MY, kTrans);
- YM_MY.AddMat(1.0, M);
- }
- tmp_S_means.CopyFromMat(YM_MY); // Sigma_{i} = -(YM' + MY')
-
- for (int32 j = 0; j < accs.num_states_; ++j) {
- for (int32 m = 0; m < model.NumSubstates(j); ++m) {
- // Sigma_{i} += gamma_{jmi} * mu_{jmi}*mu_{jmi}^T
- model.GetSubstateMean(j, m, i, &mu_jmi);
- tmp_S_means.AddVec2(static_cast<BaseFloat>(accs.gamma_[j](m, i)), mu_jmi);
- }
- }
- S_means_[i].AddSp(1.0, tmp_S_means);
- KALDI_ASSERT(1.0 / S_means_[i](0, 0) != 0.0);
- }
- }
-}
-
-BaseFloat MleAmSgmmUpdaterMulti::UpdateGlobals(const MleAmSgmmGlobalAccs &accs,
- SgmmUpdateFlagsType flags) {
- BaseFloat tot_impr = 0.0;
- if (flags & kSgmmPhoneProjections) {
- tot_impr += UpdateM(accs);
- }
- if (flags & kSgmmCovarianceMatrix) {
- tot_impr += UpdateVars(accs);
- }
- if (flags & kSgmmSpeakerProjections) {
- tot_impr += UpdateN(accs);
- if (update_options_.renormalize_N)
- KALDI_WARN << "Not renormalizing N";
- }
-
- KALDI_LOG << "**Total auxf improvement for phone projections & covariances is "
- << (tot_impr) << " over " << accs.total_frames_ << " frames.";
- return tot_impr;
-}
-
-void MleAmSgmmUpdaterMulti::Update(const std::vector<MleAmSgmmAccs*> &accs,
- const std::vector<AmSgmm*> &models,
- SgmmUpdateFlagsType flags) {
- KALDI_ASSERT((flags & (kSgmmPhoneVectors | kSgmmPhoneProjections |
- kSgmmPhoneWeightProjections | kSgmmCovarianceMatrix |
- kSgmmSubstateWeights | kSgmmSpeakerProjections)) != 0);
- if (accs.size() != models.size()) {
- KALDI_ERR << "Found " << accs.size() << " accs and " << models.size()
- << " models. Must have same number of models and accs.";
- }
-
- SgmmUpdateFlagsType global_flags = (flags & (kSgmmPhoneProjections |
- kSgmmPhoneWeightProjections |
- kSgmmSpeakerProjections |
- kSgmmCovarianceMatrix));
- SgmmUpdateFlagsType state_spec_flags = (flags & ~global_flags);
- MleAmSgmmGlobalAccs glob_accs;
- BaseFloat tot_impr = 0.0;
- int32 num_models = models.size();
-
- std::vector< SpMatrix<double> > H;
- if (update_options_.renormalize_V)
- models[0]->ComputeH(&H);
-
- if (global_flags != 0) { // expected operating case
- glob_accs.ResizeAccumulators(*models[0], global_flags);
- for (int32 i = 0; i < num_models; ++i) {
- glob_accs.AddAccumulators(*models[i], *accs[i], global_flags);
- }
- UpdateGlobals(glob_accs, global_flags);
-
- // Weight projection needs access to all models
- if (global_flags & kSgmmPhoneWeightProjections) {
- if (update_options_.use_sequential_weight_update)
- KALDI_ERR << "Sequential weight update not implemented, using parallel";
-// tot_impr += UpdateWSequential(accs, model);
-// } else {
- tot_impr += UpdateWParallel(accs, models);
-// }
- }
- } else { // Shouldn't be using this class without updating global params
- KALDI_WARN << "Using MleAmSgmmUpdaterMulti class without updating global "
- << " parameters.";
- }
-
- // Update the state-specific parameters: phone vectors & substate weights
- if (state_spec_flags != 0) {
- MleAmSgmmOptions state_spec_opts = update_options_;
- state_spec_opts.renormalize_V = false;
- state_spec_opts.renormalize_N = false;
-
- MleAmSgmmUpdater sgmm_updater(state_spec_opts);
- for (int32 i = 0; i < num_models; ++i)
- tot_impr += sgmm_updater.Update(*accs[i], models[i], state_spec_flags);
- }
-
-
- if (update_options_.renormalize_V && (global_flags != 0)) {
- SpMatrix<double> H_sm;
- this->ComputeSmoothingTerms(glob_accs, H, &H_sm);
- RenormalizeV(H_sm, models);
- }
-
- KALDI_LOG << "**Total auxf improvement, combining all parameters, over "
- << "all model is " << tot_impr << " per frame.";
-
- // The following is just for diagnostics
- double total_frames = 0, total_like = 0;
- for (int32 i = 0; i < num_models; ++i) {
- total_frames += accs[i]->TotalFrames();
- total_like += accs[i]->TotalLike();
- }
- KALDI_LOG << "***Total data likelihood, over all models, is "
- << (total_like/total_frames) << " over " << total_frames
- << " frames.";
-
- // Now, copy the global parameters to the models
- for (int32 i = 0; i < num_models; ++i) {
- if ((flags & kSgmmPhoneProjections) || update_options_.renormalize_V)
- models[i]->M_ = global_M_;
- if (flags & kSgmmCovarianceMatrix)
- models[i]->SigmaInv_ = global_SigmaInv_;
- if ((flags & kSgmmSpeakerProjections) || update_options_.renormalize_N)
- models[i]->N_ = global_N_;
- if ((flags & kSgmmPhoneWeightProjections) || update_options_.renormalize_V)
- models[i]->w_ = global_w_;
- models[i]->ComputeNormalizers(); // So that the models are ready to use.
- }
-}
-
-// Compute H^{(sm)}, the "smoothing" matrices.
-void MleAmSgmmUpdaterMulti::ComputeSmoothingTerms(
- const MleAmSgmmGlobalAccs &accs,
- const std::vector< SpMatrix<double> > &H,
- SpMatrix<double> *H_sm) const {
- KALDI_ASSERT(H_sm != NULL);
- H_sm->Resize(PhoneSpaceDim());
-
- double sum = 0.0;
- for (int32 i = 0; i < NumGauss(); ++i) {
- if (accs.gamma_i_(i) > 0) {
- H_sm->AddSp(accs.gamma_i_(i), H[i]);
- sum += accs.gamma_i_(i);
- }
- }
-
- if (sum == 0.0) {
- KALDI_WARN << "Sum of counts is zero. Smoothing matrix set to unit";
- H_sm->SetUnit(); // arbitrary non-singular matrix
- } else {
- H_sm->Scale(1.0 / sum);
- int32 tmp = H_sm->LimitCondDouble(update_options_.max_cond_H_sm);
- if (tmp > 0) {
- KALDI_WARN << "Limited " << tmp << " eigenvalues of H_sm.";
- }
- }
-}
-
-double MleAmSgmmUpdaterMulti::UpdateM(const MleAmSgmmGlobalAccs &accs) {
- double totcount = 0.0, tot_like_impr = 0.0;
- for (int32 i = 0; i < accs.num_gaussians_; ++i) {
- if (accs.gamma_i_(i) < accs.feature_dim_) {
- KALDI_WARN << "For component " << i << ": not updating M due to very "
- << "small count (=" << accs.gamma_i_(i) << ").";
- continue;
- }
-
-
- SolverOptions opts;
- opts.name = "M";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- Matrix<double> Mi(global_M_[i]);
- double impr =
- SolveQuadraticMatrixProblem(accs.Q_[i], accs.Y_[i],
- SpMatrix<double>(global_SigmaInv_[i]),
- opts, &Mi);
- global_M_[i].CopyFromMat(Mi);
-
- if (i % 50 == 0) {
- KALDI_VLOG(2) << "Objf impr for projection M for i = " << i << ", is "
- << (impr/(accs.gamma_i_(i) + 1.0e-20)) << " over "
- << accs.gamma_i_(i) << " frames";
- }
- totcount += accs.gamma_i_(i);
- tot_like_impr += impr;
- }
- tot_like_impr /= (totcount + 1.0e-20);
- KALDI_LOG << "Overall objective function improvement for model projections "
- << "M is " << tot_like_impr << " over " << totcount << " frames";
- return tot_like_impr;
-}
-
-double MleAmSgmmUpdaterMulti::UpdateN(const MleAmSgmmGlobalAccs &accs) {
- double totcount = 0.0, tot_like_impr = 0.0;
- if (accs.spk_space_dim_ == 0 || accs.R_.size() == 0 || accs.Z_.size() == 0) {
- KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated";
- }
-
- for (int32 i = 0; i < accs.num_gaussians_; ++i) {
- if (accs.gamma_i_(i) < 2 * accs.spk_space_dim_) {
- KALDI_WARN << "Not updating speaker basis for i = " << (i)
- << " because count is too small " << (accs.gamma_i_(i));
- continue;
- }
-
- SolverOptions opts;
- opts.name = "N";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- Matrix<double> Ni(global_N_[i]);
- double impr =
- SolveQuadraticMatrixProblem(accs.R_[i], accs.Z_[i],
- SpMatrix<double>(global_SigmaInv_[i]),
- opts, &Ni);
- global_N_[i].CopyFromMat(Ni);
- if (i < 10) {
- KALDI_LOG << "Objf impr for spk projection N for i = " << (i)
- << ", is " << (impr / (accs.gamma_i_(i) + 1.0e-20)) << " over "
- << (accs.gamma_i_(i)) << " frames";
- }
- totcount += accs.gamma_i_(i);
- tot_like_impr += impr;
- }
-
- tot_like_impr /= (totcount+1.0e-20);
- KALDI_LOG << "**Overall objf impr for N is " << tot_like_impr << " over "
- << totcount << " frames";
- return tot_like_impr;
-}
-
-
-double MleAmSgmmUpdaterMulti::UpdateVars(const MleAmSgmmGlobalAccs &accs) {
- SpMatrix<double> Sigma_i(accs.feature_dim_), Sigma_i_ml(accs.feature_dim_);
- double tot_objf_impr = 0.0, tot_t = 0.0;
- SpMatrix<double> covfloor(accs.feature_dim_);
- Vector<double> objf_improv(accs.num_gaussians_);
-
- // First pass over all (shared) Gaussian components to calculate the
- // ML estimate of the covariances, and the total covariance for flooring.
- for (int32 i = 0; i < accs.num_gaussians_; ++i) {
- // Eq. (75): Sigma_{i}^{ml} = 1/gamma_{i} [S_{i} + S_{i}^{(means)} - ...
- // Y_{i} M_{i}^T - M_{i} Y_{i}^T]
- // Note the S_means_ already contains the Y_{i} M_{i}^T terms.
- Sigma_i_ml.CopyFromSp(accs.S_means_[i]);
- Sigma_i_ml.AddSp(1.0, accs.S_[i]);
- covfloor.AddSp(1.0, Sigma_i_ml);
- // inverting small values e.g. 4.41745328e-40 seems to generate inf,
- // although would be fixed up later.
- if (accs.gamma_i_(i) > 1.0e-20) {
- Sigma_i_ml.Scale(1 / (accs.gamma_i_(i) + 1.0e-20));
- } else {
- Sigma_i_ml.SetUnit();
- }
- KALDI_ASSERT(1.0 / Sigma_i_ml(0, 0) != 0.0);
- // Eq. (76): Compute the objective function with the old parameter values
- objf_improv(i) = global_SigmaInv_[i].LogPosDefDet() -
- TraceSpSp(SpMatrix<double>(global_SigmaInv_[i]), Sigma_i_ml);
-
- global_SigmaInv_[i].CopyFromSp(Sigma_i_ml); // inverted in the next loop.
- }
-
- // Compute the covariance floor.
- if (accs.gamma_i_.Sum() == 0) { // If no count, use identity.
- KALDI_WARN << "Updating variances: zero counts. Setting floor to unit.";
- covfloor.SetUnit();
- } else { // else, use the global average covariance.
- covfloor.Scale(update_options_.cov_floor / accs.gamma_i_.Sum());
- int32 tmp;
- if ((tmp = covfloor.LimitCondDouble(update_options_.max_cond)) != 0) {
- KALDI_WARN << "Covariance flooring matrix is poorly conditioned. Fixed "
- << "up " << (tmp) << " eigenvalues.";
- }
- }
-
- if (update_options_.cov_diag_ratio > 1000) {
- KALDI_LOG << "Assuming you want to build a diagonal system since "
- << "cov_diag_ratio is large: making diagonal covFloor.";
- for (int32 i = 0; i < covfloor.NumRows(); i++)
- for (int32 j = 0; j < i; j++)
- covfloor(i, j) = 0.0;
- }
-
- // Second pass over all (shared) Gaussian components to calculate the
- // floored estimate of the covariances, and update the model.
- for (int32 i = 0; i < accs.num_gaussians_; ++i) {
- Sigma_i.CopyFromSp(global_SigmaInv_[i]);
- Sigma_i_ml.CopyFromSp(Sigma_i);
- // In case of insufficient counts, make the covariance matrix diagonal.
- // cov_diag_ratio is 2 by default, set to very large to always get diag-cov
- if (accs.gamma_i_(i) < update_options_.cov_diag_ratio * accs.feature_dim_) {
- KALDI_WARN << "For Gaussian component " << i << ": Too low count "
- << accs.gamma_i_(i) << " for covariance matrix estimation. "
- << "Setting to diagonal";
- for (int32 d = 0; d < accs.feature_dim_; d++)
- for (int32 e = 0; e < d; e++)
- Sigma_i(d, e) = 0.0; // SpMatrix, can only set lower traingular part
-
- int floored = Sigma_i.ApplyFloor(covfloor);
- if (floored > 0) {
- KALDI_WARN << "For Gaussian component " << i << ": Floored " << floored
- << " covariance eigenvalues.";
- }
- global_SigmaInv_[i].CopyFromSp(Sigma_i);
- global_SigmaInv_[i].InvertDouble();
- } else { // Updating the full covariance matrix.
- try {
- int floored = Sigma_i.ApplyFloor(covfloor);
- if (floored > 0) {
- KALDI_WARN << "For Gaussian component " << i << ": Floored "
- << floored << " covariance eigenvalues.";
- }
- global_SigmaInv_[i].CopyFromSp(Sigma_i);
- global_SigmaInv_[i].InvertDouble();
-
- objf_improv(i) += Sigma_i.LogPosDefDet() +
- TraceSpSp(SpMatrix<double>(global_SigmaInv_[i]), Sigma_i_ml);
- objf_improv(i) *= (-0.5 * accs.gamma_i_(i)); // Eq. (76)
-
- tot_objf_impr += objf_improv(i);
- tot_t += accs.gamma_i_(i);
- if (i < 5) {
- KALDI_VLOG(2) << "objf impr from variance update =" << objf_improv(i)
- / (accs.gamma_i_(i) + 1.0e-20) << " over " << (accs.gamma_i_(i))
- << " frames for i = " << (i);
- }
- } catch(...) {
- KALDI_WARN << "Updating within-class covariance matrix i = " << (i)
- << ", numerical problem";
- // This is a catch-all thing in case of unanticipated errors, but
- // flooring should prevent this occurring for the most part.
- global_SigmaInv_[i].SetUnit(); // Set to unit.
- }
- }
- }
- KALDI_LOG << "**Overall objf impr for variance update = "
- << (tot_objf_impr / (tot_t+ 1.0e-20))
- << " over " << (tot_t) << " frames";
- return tot_objf_impr / (tot_t + 1.0e-20);
-}
-
-
// The parallel weight update, in the paper.
// Updates the tied weight-projection matrix w (rows w_i, one per Gaussian)
// using stats pooled from all model/accumulator pairs.  Works on a local
// copy of global_w_, backs off (halves the step) whenever the measured
// likelihood decreases, and commits the copy back at the end.  Returns the
// measured per-frame likelihood improvement.
double MleAmSgmmUpdaterMulti::UpdateWParallel(
    const std::vector<MleAmSgmmAccs*> &accs,
    const std::vector<AmSgmm*> &models) {
  KALDI_LOG << "Updating weight projections";

  // Dimensions come from the first model; the models share the tied global
  // parameters, so these are expected to agree across the whole vector.
  int32 phn_dim = models[0]->PhoneSpaceDim(),
      num_gauss = models[0]->NumGauss(),
      num_models = models.size();
  SpMatrix<double> v_vT(phn_dim);
  // tot_like_{after, before} are totals over multiple iterations,
  // not valid likelihoods. but difference is valid (when divided by tot_count).
  double tot_predicted_like_impr = 0.0, tot_like_before = 0.0,
      tot_like_after = 0.0, tot_count = 0.0;

  Vector<double> w_jm(num_gauss);  // per-substate log-weights (later exp'ed)
  Matrix<double> g_i(num_gauss, phn_dim);  // linear term of auxf, per Gaussian
  std::vector< SpMatrix<double> > F_i(num_gauss);  // quadratic term, per Gaussian

  Matrix<double> w(global_w_);  // working copy; committed only at the end
  for (int iter = 0; iter < update_options_.weight_projections_iters; iter++) {
    for (int32 i = 0; i < num_gauss; ++i) {
      F_i[i].Resize(phn_dim, kSetZero);
    }
    double k_like_before = 0.0, k_count = 0.0;
    g_i.SetZero();

    // Unlike in the report the inner most loop is over Gaussians, where
    // per-gaussian statistics are accumulated. This is more memory demanding
    // but more computationally efficient, as outer product v_{jvm} v_{jvm}^T
    // is computed only once for all gaussians.

    for (int32 mdl_idx = 0; mdl_idx < num_models; ++mdl_idx) {
      std::vector< Matrix<double> > gamma(accs[mdl_idx]->GetOccs());
      for (int32 j = 0; j < models[mdl_idx]->NumPdfs(); j++) {
        for (int32 m = 0; m < models[mdl_idx]->NumSubstates(j); m++) {
          double gamma_jm = gamma[j].Row(m).Sum();  // substate occupancy
          k_count += gamma_jm;

          // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
          w_jm.AddMatVec(1.0, w, kNoTrans,
                         Vector<double>(models[mdl_idx]->v_[j].Row(m)), 0.0);
          w_jm.Add((-1.0) * w_jm.LogSumExp());  // normalize in log domain
          k_like_before += VecVec(w_jm, gamma[j].Row(m));
          w_jm.ApplyExp();
          v_vT.SetZero();
          // v_vT := v_{jkm} v_{jkm}^T
          v_vT.AddVec2(1.0, models[mdl_idx]->v_[j].Row(m));

          for (int32 i = 0; i < num_gauss; i++) {
            // Suggestion: g_jkm can be computed more efficiently
            // using the Vector/Matrix routines for all i at once
            // linear term around cur value.
            double linear_term = gamma[j](m, i) - gamma_jm * w_jm(i);
            double quadratic_term = std::max(gamma[j](m, i), gamma_jm * w_jm(i));
            g_i.Row(i).AddVec(linear_term, models[mdl_idx]->v_[j].Row(m));
            // Now I am calling this F_i in the document. [dan]
            F_i[i].AddSp(quadratic_term, v_vT);
          }
        } // loop over substates
      } // loop over states
    } // loop over model/acc pairs

    Matrix<double> w_orig(w);  // saved so we can undo a bad step
    double k_predicted_like_impr = 0.0, k_like_after = 0.0;
    double min_step = 0.001, step_size;

    SolverOptions opts;
    opts.name = "w";
    opts.K = update_options_.max_cond;
    opts.eps = update_options_.epsilon;

    // Step-halving loop: take the full solver step first; if the measured
    // likelihood goes down, restore w_orig and retry with half the step.
    for (step_size = 1.0; step_size >= min_step; step_size /= 2) {
      k_predicted_like_impr = 0.0;
      k_like_after = 0.0;

      for (int32 i = 0; i < num_gauss; i++) {
        // auxf is formulated in terms of change in w.
        Vector<double> delta_w(phn_dim);
        // returns objf impr with step_size = 1,
        // but it may not be 1 so we recalculate it.
        SolveQuadraticProblem(F_i[i], g_i.Row(i), opts, &delta_w);

        delta_w.Scale(step_size);
        double predicted_impr = VecVec(delta_w, g_i.Row(i)) -
            0.5 * VecSpVec(delta_w, F_i[i], delta_w);

        // should never be negative because
        // we checked inside SolveQuadraticProblem.
        KALDI_ASSERT(predicted_impr >= -1.0e-05);

        if (i < 10) {
          KALDI_LOG << "Predicted objf impr for w (not per frame), iter = " <<
              (iter) << ", i = " << (i) << " is " << (predicted_impr);
        }
        k_predicted_like_impr += predicted_impr;
        w.Row(i).AddVec(1.0, delta_w);
      }

      // Measure the actual (exact) likelihood under the stepped w, summed
      // over all substates of all models.
      for (int32 mdl_idx = 0; mdl_idx < num_models; ++mdl_idx) {
        std::vector< Matrix<double> > gamma(accs[mdl_idx]->GetOccs());
        for (int32 j = 0; j < models[mdl_idx]->NumPdfs(); j++) {
          for (int32 m = 0; m < models[mdl_idx]->NumSubstates(j); m++) {
            w_jm.AddMatVec(1.0, w, kNoTrans,
                           Vector<double>(models[mdl_idx]->v_[j].Row(m)), 0.0);
            w_jm.Add((-1.0) * w_jm.LogSumExp());
            k_like_after += VecVec(w_jm, gamma[j].Row(m));
          }
        }
      }
      KALDI_VLOG(2) << "For iteration " << (iter) << ", updating w gives "
                    << "predicted per-frame like impr "
                    << (k_predicted_like_impr / k_count) << ", actual "
                    << ((k_like_after - k_like_before) / k_count) << ", over "
                    << (k_count) << " frames";
      if (k_like_after < k_like_before) {
        w.CopyFromMat(w_orig); // Undo what we computed.
        if (fabs(k_like_after - k_like_before) / k_count < 1.0e-05) {
          k_like_after = k_like_before;
          KALDI_WARN << "Not updating weights as not increasing auxf and "
                     << "probably due to numerical issues (since small change).";
          break;
        } else {
          KALDI_WARN << "Halving step size for weights as likelihood did "
                     << "not increase";
        }
      } else {
        break;  // likelihood improved: accept this step size.
      }
    }
    if (step_size < min_step) {
      // Undo any step as we have no confidence that this is right.
      w.CopyFromMat(w_orig);
    } else {
      // Frame count is the same on every outer iteration; count it once.
      if (iter == 0) {
        tot_count += k_count;
      }
      tot_predicted_like_impr += k_predicted_like_impr;
      tot_like_after += k_like_after;
      tot_like_before += k_like_before;
    }
  }

  global_w_.CopyFromMat(w);  // commit the accepted weights

  tot_predicted_like_impr /= tot_count;
  tot_like_after = (tot_like_after - tot_like_before) / tot_count;
  KALDI_LOG << "**Overall objf impr for w is " << tot_predicted_like_impr
            << ", actual " << tot_like_after << ", over "
            << tot_count << " frames";
  return tot_like_after;
}
-
// Jointly renormalizes the phonetic subspace across all models: finds a
// transform that makes the pooled scatter of the state vectors v_{jm} the
// unit matrix and diagonalizes the (projected) smoothing matrix H_sm, then
// applies it to the vectors and the matching inverse transform to the tied
// weight projections w_i and phone projections M_i, so every model's
// likelihood is left unchanged.
void MleAmSgmmUpdaterMulti::RenormalizeV(const SpMatrix<double> &H_sm,
                                         const vector<AmSgmm*> &models) {
  int32 phn_dim = PhoneSpaceDim(),
      feat_dim = FeatureDim(),
      num_models = models.size();
  // Sigma: (uncentered) scatter of the substate vectors v_{jm}, pooled over
  // every model's states and substates.
  SpMatrix<double> Sigma(phn_dim);
  int32 count = 0;
  for (int32 mdl = 0; mdl < num_models; ++mdl) {
    for (int32 j = 0; j < models[mdl]->NumPdfs(); ++j) {
      for (int32 m = 0; m < models[mdl]->NumSubstates(j); ++m) {
        count++;
        Sigma.AddVec2(static_cast<BaseFloat>(1.0), models[mdl]->v_[j].Row(m));
      }
    }
  }
  Sigma.Scale(1.0 / count);  // average over all substate vectors
  int32 fixed_eigs = Sigma.LimitCondDouble(update_options_.max_cond);
  if (fixed_eigs != 0) {
    KALDI_WARN << "Scatter of vectors v is poorly conditioned. Fixed up "
               << fixed_eigs << " eigenvalues.";
  }
  KALDI_LOG << "Eigenvalues of scatter of vectors v is : ";
  Sigma.PrintEigs("Sigma");
  if (!Sigma.IsPosDef()) {
    KALDI_LOG << "Not renormalizing v because scatter is not positive definite"
              << " -- maybe first iter?";
    return;
  }

  // Want to make variance of v unit and H_sm (like precision matrix) diagonal.
  TpMatrix<double> L(phn_dim);
  L.Cholesky(Sigma);  // Sigma = L L^T
  TpMatrix<double> LInv(L);
  LInv.Invert();

  Matrix<double> tmpL(phn_dim, phn_dim);
  tmpL.CopyFromTp(L);  // dense copy; needed for AddMat2Sp below

  SpMatrix<double> H_sm_proj(phn_dim);
  H_sm_proj.AddMat2Sp(1.0, tmpL, kTrans, H_sm, 0.0);
  // H_sm_proj := L^{T} * H_sm * L.
  // This is right because we would transform the vectors themselves
  // by L^{-1}, and H_sm is like the inverse of the vectors,
  // so it's {L^{-1}}^{-T} = L^T.

  Matrix<double> U(phn_dim, phn_dim);
  Vector<double> eigs(phn_dim);
  H_sm_proj.SymPosSemiDefEig(&eigs, &U, 1.0);  // 1.0 means no checking +ve def -> faster
  KALDI_LOG << "Note on the next diagnostic: the first number is generally not "
            << "that meaningful as it relates to the static offset";
  H_sm_proj.PrintEigs("H_sm_proj (Significance of dims in vector space.. note)");

  // Transform on vectors is U^T L^{-1}.
  // Why? Because transform on H_sm is T =U^T L^T
  // and we want T^{-T} by normal rules of vector/covector and we
  // have (U^T L^T)^{-T} = (L U)^{-1} = U^T L^{-1}.
  Matrix<double> Trans(phn_dim, phn_dim);  // T^{-T}
  Matrix<double> tmpLInv(phn_dim, phn_dim);
  tmpLInv.CopyFromTp(LInv);
  Trans.AddMatMat(1.0, U, kTrans, tmpLInv, kNoTrans, 0.0);
  Matrix<double> TransInv(Trans);
  TransInv.Invert();  // T in above...

#ifdef KALDI_PARANOID
  // Sanity checks (debug builds only): the transform must diagonalize H_sm
  // and whiten the scatter Sigma.
  {
    SpMatrix<double> H_sm_tmp(phn_dim);
    H_sm_tmp.AddMat2Sp(1.0, TransInv, kTrans, H_sm, 0.0);
    KALDI_ASSERT(H_sm_tmp.IsDiagonal(0.1));
  }
  {
    SpMatrix<double> Sigma_tmp(phn_dim);
    Sigma_tmp.AddMat2Sp(1.0, Trans, kNoTrans, Sigma, 0.0);
    KALDI_ASSERT(Sigma_tmp.IsUnit(0.1));
  }
#endif

  // Apply Trans to every substate vector of every model.
  for (int32 mdl = 0; mdl < num_models; ++mdl) {
    for (int32 j = 0; j < models[mdl]->NumPdfs(); ++j) {
      for (int32 m = 0; m < models[mdl]->NumSubstates(j); ++m) {
        Vector<double> tmp(phn_dim);
        tmp.AddMatVec(1.0, Trans, kNoTrans, Vector<double>(models[mdl]->v_[j].Row(m)), 0.0);
        models[mdl]->v_[j].Row(m).CopyFromVec(tmp);
      }
    }
  }
  // Apply the compensating inverse transform to the tied projections so the
  // products w_i^T v and M_i v are unchanged.
  for (int32 i = 0; i < NumGauss(); ++i) {
    Vector<double> tmp(phn_dim);
    tmp.AddMatVec(1.0, TransInv, kTrans, Vector<double>(global_w_.Row(i)), 0.0);
    global_w_.Row(i).CopyFromVec(tmp);

    Matrix<double> tmpM(feat_dim, phn_dim);
    // Multiplying on right not left so must not transpose TransInv.
    tmpM.AddMatMat(1.0, Matrix<double>(global_M_[i]), kNoTrans,
                   TransInv, kNoTrans, 0.0);
    global_M_[i].CopyFromMat(tmpM);
  }
  KALDI_LOG << "Renormalized subspace.";
}
-
-} // namespace kaldi
diff --git a/src/sgmm/estimate-am-sgmm-multi.h b/src/sgmm/estimate-am-sgmm-multi.h
+++ /dev/null
@@ -1,146 +0,0 @@
-// sgmm/estimate-am-sgmm-multi.h
-
-// Copyright 2012 Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM_ESTIMATE_AM_SGMM_MULTI_H_
-#define KALDI_SGMM_ESTIMATE_AM_SGMM_MULTI_H_ 1
-
-#include <string>
-#include <vector>
-
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "gmm/model-common.h"
-
-namespace kaldi {
-
/** \class MleAmSgmmGlobalAccs
 * Class for the accumulators associated with SGMM global parameters (e.g.
 * phonetic-, weight- and speaker-projections; and covariances). This is
 * used when the global parameters are updated using stats from multiple
 * models.
 */
class MleAmSgmmGlobalAccs {
 public:
  // Default-constructs an empty accumulator; call ResizeAccumulators()
  // before use.
  explicit MleAmSgmmGlobalAccs()
      : feature_dim_(0), phn_space_dim_(0), spk_space_dim_(0),
        num_gaussians_(0), total_frames_(0.0), total_like_(0.0) {}

  /// Resizes the accumulators to the correct sizes given the model. The flags
  /// argument control which accumulators to resize.
  void ResizeAccumulators(const AmSgmm &model, SgmmUpdateFlagsType flags);

  /// Set the accumulators specified by the flags argument to zero.
  void ZeroAccumulators(SgmmUpdateFlagsType flags);

  /// Add another accumulator object
  void AddAccumulators(const AmSgmm &model, const MleAmSgmmAccs &acc,
                       SgmmUpdateFlagsType flags);

  /// Accessors for the stored dimensions.
  int32 FeatureDim() const { return feature_dim_; }
  int32 PhoneSpaceDim() const { return phn_space_dim_; }
  int32 NumGauss() const { return num_gaussians_; }

 private:
  /// The stats which are not tied to any state.
  /// Stats Y_{i} for phonetic-subspace projections M; Dim is [I][D][S].
  std::vector< Matrix<double> > Y_;
  /// Stats Z_{i} for speaker-subspace projections N. Dim is [I][D][T].
  std::vector< Matrix<double> > Z_;
  /// R_{i}, quadratic term for speaker subspace estimation. Dim is [I][T][T]
  std::vector< SpMatrix<double> > R_;
  /// S_{i}^{-}, scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D].
  std::vector< SpMatrix<double> > S_;
  /// Total occupancies gamma_i for each Gaussian. Dim is [I]
  Vector<double> gamma_i_;

  /// Q_{i}, quadratic term for phonetic subspace estimation. Dim is [I][S][S]
  std::vector< SpMatrix<double> > Q_;
  /// Eq (74): S_{i}^{(means)}, scatter of substate mean vectors for estimating
  /// the shared covariance matrices. Dimension is [I][D][D].
  std::vector< SpMatrix<double> > S_means_;

  /// Dimensionality of various subspaces
  int32 feature_dim_, phn_space_dim_, spk_space_dim_;
  int32 num_gaussians_;  ///< Other model specifications

  /// Accumulated frame count and total log-likelihood (diagnostics).
  double total_frames_, total_like_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmGlobalAccs);
  // The updater reads these private stats directly.
  friend class MleAmSgmmUpdaterMulti;
};
-
-
/** \class MleAmSgmmUpdaterMulti
 * Contains the functions needed to update the parameters for multiple SGMMs
 * whose global parameters are tied.
 */
class MleAmSgmmUpdaterMulti {
 public:
  // Caches working copies of the model's global parameters (covariances,
  // phone/speaker projections, weight projections); Update() modifies these
  // copies and writes them back to the models.
  explicit MleAmSgmmUpdaterMulti(const AmSgmm &model,
                                 const MleAmSgmmOptions &options)
      : update_options_(options), global_SigmaInv_(model.SigmaInv_),
        global_M_(model.M_), global_N_(model.N_), global_w_(model.w_) {}

  // Main entry point: updates the tied global parameters selected by 'flags'
  // using the stats in 'accs', one accumulator per model.
  void Update(const std::vector<MleAmSgmmAccs*> &accs,
              const std::vector<AmSgmm*> &models,
              SgmmUpdateFlagsType flags);

  /// Various model dimensions.
  int32 NumGauss() const { return global_M_.size(); }
  int32 PhoneSpaceDim() const { return global_w_.NumCols(); }
  int32 SpkSpaceDim() const {
    // Speaker subspace is optional; zero means "not set up".
    return (global_N_.size() > 0) ? global_N_[0].NumCols() : 0;
  }
  int32 FeatureDim() const { return global_M_[0].NumRows(); }

 private:
  MleAmSgmmOptions update_options_;

  /// SGMM global parameters that will be updated together and copied to the
  /// different models:
  std::vector< SpMatrix<BaseFloat> > global_SigmaInv_;
  std::vector< Matrix<BaseFloat> > global_M_;
  std::vector< Matrix<BaseFloat> > global_N_;
  Matrix<BaseFloat> global_w_;

  // Per-parameter update routines; each returns an objective-function
  // improvement (per frame) for diagnostics.
  BaseFloat UpdateGlobals(const MleAmSgmmGlobalAccs &glob_accs,
                          SgmmUpdateFlagsType flags);

  double UpdateM(const MleAmSgmmGlobalAccs &accs);
  double UpdateN(const MleAmSgmmGlobalAccs &accs);
  double UpdateVars(const MleAmSgmmGlobalAccs &accs);
  double UpdateWParallel(const std::vector<MleAmSgmmAccs*> &accs,
                         const std::vector<AmSgmm*> &models);
//  double UpdateWSequential(const std::vector<MleAmSgmmAccs*> &accs,
//                           const std::vector<AmSgmm*> &models);

  void ComputeSmoothingTerms(const MleAmSgmmGlobalAccs &accs,
                             const std::vector<SpMatrix<double> > &H,
                             SpMatrix<double> *H_sm) const;
  void RenormalizeV(const SpMatrix<double> &H_sm,
                    const std::vector<AmSgmm*> &models);

  KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmUpdaterMulti);
  MleAmSgmmUpdaterMulti() {}  // Prevent unconfigured updater.
};
-
-} // namespace kaldi
-
-
-#endif // KALDI_SGMM_ESTIMATE_AM_SGMM_MULTI_H_
diff --git a/src/sgmm/estimate-am-sgmm-test.cc b/src/sgmm/estimate-am-sgmm-test.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-// sgmm/estimate-am-sgmm-test.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmm/model-test-common.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "util/kaldi-io.h"
-#include "base/kaldi-math.h"
-
-using kaldi::AmSgmm;
-using kaldi::MleAmSgmmAccs;
-using kaldi::BaseFloat;
-using kaldi::Exp;
-
-namespace ut = kaldi::unittest;
-
// Tests the Read() and Write() methods for the accumulators, in both binary
// and ASCII mode, as well as Check().  The strategy: accumulate stats from
// 'feats', run an update to get a reference log-likelihood, then round-trip
// the accs through text and binary I/O and verify that updating from the
// re-read accs reproduces that likelihood.
void TestUpdateAndAccsIO(const AmSgmm &sgmm,
                         const kaldi::Matrix<BaseFloat> &feats) {
  using namespace kaldi;
  typedef kaldi::int32 int32;

  kaldi::SgmmUpdateFlagsType flags = kaldi::kSgmmAll;
  kaldi::SgmmPerFrameDerivedVars frame_vars;
  kaldi::SgmmPerSpkDerivedVars empty;  // no speaker adaptation in this test
  frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(),
                    sgmm.PhoneSpaceDim());
  kaldi::SgmmGselectConfig sgmm_config;
  // Don't select more Gaussians than the model has.
  sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest,
                                        sgmm.NumGauss());
  MleAmSgmmAccs accs(sgmm, flags);
  BaseFloat loglike = 0.0;  // accumulated but not asserted on below
  Vector<BaseFloat> empty_spk;
  // Accumulate stats for every frame under pdf-state 0, weight 1.0.
  for (int32 i = 0; i < feats.NumRows(); i++) {
    std::vector<int32> gselect;
    sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect);
    sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, 0.0, &frame_vars);
    loglike += accs.Accumulate(sgmm, frame_vars, empty_spk, 0, 1.0, flags);
  }
  accs.CommitStatsForSpk(sgmm, empty_spk);

  // Reference update on a copy of the model; loglike1 is the value the
  // round-tripped accs must reproduce.
  kaldi::MleAmSgmmOptions update_opts;
  update_opts.check_v = (Rand()%2 == 0);  // exercise both code paths
  AmSgmm *sgmm1 = new AmSgmm();
  sgmm1->CopyFromSgmm(sgmm, false);
  kaldi::MleAmSgmmUpdater updater(update_opts);
  updater.Update(accs, sgmm1, flags);
  std::vector<int32> gselect;

  sgmm1->GaussianSelection(sgmm_config, feats.Row(0), &gselect);
  sgmm1->ComputePerFrameVars(feats.Row(0), gselect, empty, 0.0, &frame_vars);
  BaseFloat loglike1 = sgmm1->LogLikelihood(frame_vars, 0);
  delete sgmm1;

  // First, non-binary write
  accs.Write(kaldi::Output("tmpf", false).Stream(), false);
  bool binary_in;
  MleAmSgmmAccs *accs1 = new MleAmSgmmAccs();
  // Non-binary read
  kaldi::Input ki1("tmpf", &binary_in);
  accs1->Read(ki1.Stream(), binary_in, false);
  accs1->Check(sgmm, true);
  AmSgmm *sgmm2 = new AmSgmm();
  sgmm2->CopyFromSgmm(sgmm, false);
  updater.Update(*accs1, sgmm2, flags);

  sgmm2->GaussianSelection(sgmm_config, feats.Row(0), &gselect);
  sgmm2->ComputePerFrameVars(feats.Row(0), gselect, empty, 0.0, &frame_vars);
  BaseFloat loglike2 = sgmm2->LogLikelihood(frame_vars, 0);
  // Text round-trip: allow some rounding slack from ASCII formatting.
  kaldi::AssertEqual(loglike1, loglike2, 1e-4);
  delete accs1;

  // Next, binary write
  accs.Write(kaldi::Output("tmpfb", true).Stream(), true);
  MleAmSgmmAccs *accs2 = new MleAmSgmmAccs();
  // Binary read
  kaldi::Input ki2("tmpfb", &binary_in);
  accs2->Read(ki2.Stream(), binary_in, false);
  accs2->Check(sgmm, true);
  AmSgmm *sgmm3 = new AmSgmm();
  sgmm3->CopyFromSgmm(sgmm, false);
  updater.Update(*accs2, sgmm3, flags);
  sgmm3->GaussianSelection(sgmm_config, feats.Row(0), &gselect);
  sgmm3->ComputePerFrameVars(feats.Row(0), gselect, empty, 0.0, &frame_vars);
  BaseFloat loglike3 = sgmm3->LogLikelihood(frame_vars, 0);
  // Binary round-trip should be near-exact, hence the tighter tolerance.
  kaldi::AssertEqual(loglike1, loglike3, 1e-6);

  // Testing the MAP update of M
  update_opts.tau_map_M = 100;
  update_opts.full_col_cov = (RandUniform() > 0.5)? true : false;
  update_opts.full_row_cov = (RandUniform() > 0.5)? true : false;
  kaldi::MleAmSgmmUpdater updater_map(update_opts);
  BaseFloat impr = updater_map.Update(*accs2, sgmm3, flags);
  KALDI_ASSERT(impr >= 0);  // an ML/MAP update must not decrease the auxf

  delete accs2;
  delete sgmm2;
  delete sgmm3;

  // Clean up the temporary files written above.
  unlink("tmpf");
  unlink("tmpfb");
}
-
// Builds a small random SGMM (initialized from a random full-covariance GMM)
// and random diagonal-Gaussian features, then runs the accumulator I/O and
// update test on them.
void UnitTestEstimateSgmm() {
  int32 dim = 1 + kaldi::RandInt(0, 9);  // random dimension of the gmm
  int32 num_comp = 2 + kaldi::RandInt(0, 9);  // random mixture size
  kaldi::FullGmm full_gmm;
  ut::InitRandFullGmm(dim, num_comp, &full_gmm);

  int32 num_states = 1;
  AmSgmm sgmm;
  kaldi::SgmmGselectConfig config;  // NOTE(review): unused in this function
  sgmm.InitializeFromFullGmm(full_gmm, num_states, dim+1, dim);
  sgmm.ComputeNormalizers();

  kaldi::Matrix<BaseFloat> feats;

  { // First, generate random means and variances
    // Feature mixture size deliberately differs from the model's.
    int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2);
    kaldi::Matrix<BaseFloat> means(num_feat_comp, dim),
        vars(num_feat_comp, dim);
    for (int32 m = 0; m < num_feat_comp; m++) {
      for (int32 d= 0; d < dim; d++) {
        means(m, d) = kaldi::RandGauss();
        vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2;  // strictly positive
      }
    }
    // Now generate random features with those means and variances.
    feats.Resize(num_feat_comp * 200, dim);
    for (int32 m = 0; m < num_feat_comp; m++) {
      kaldi::SubMatrix<BaseFloat> tmp(feats, m*200, 200, 0, dim);
      ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp);
    }
  }
  TestUpdateAndAccsIO(sgmm, feats);
}
-
-int main() {
- for (int i = 0; i < 10; i++)
- UnitTestEstimateSgmm();
- std::cout << "Test OK.\n";
- return 0;
-}
diff --git a/src/sgmm/estimate-am-sgmm.cc b/src/sgmm/estimate-am-sgmm.cc
+++ /dev/null
@@ -1,2135 +0,0 @@
-// sgmm/estimate-am-sgmm.cc
-
-// Copyright 2009-2011 Microsoft Corporation; Lukas Burget;
-// Saarland University (Author: Arnab Ghoshal);
-// Ondrej Glembek; Yanmin Qian;
-// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
-// Liang Lu; Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "thread/kaldi-thread.h"
-
-namespace kaldi {
-using std::string;
-using std::vector;
-
// Serializes the accumulators.  Writes the dimension header unconditionally,
// then each group of stats only if it was allocated (gated on .size()), each
// introduced by a token that Read() dispatches on.  Token spellings
// (including the "<NUMGaussians>" casing) must match Read() exactly.
void MleAmSgmmAccs::Write(std::ostream &out_stream, bool binary) const {
  uint32 tmp_uint32;

  WriteToken(out_stream, binary, "<SGMMACCS>");

  WriteToken(out_stream, binary, "<NUMSTATES>");
  tmp_uint32 = static_cast<uint32>(num_states_);
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<NUMGaussians>");
  tmp_uint32 = static_cast<uint32>(num_gaussians_);
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<FEATUREDIM>");
  tmp_uint32 = static_cast<uint32>(feature_dim_);
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<PHONESPACEDIM>");
  tmp_uint32 = static_cast<uint32>(phn_space_dim_);
  WriteBasicType(out_stream, binary, tmp_uint32);
  WriteToken(out_stream, binary, "<SPKSPACEDIM>");
  tmp_uint32 = static_cast<uint32>(spk_space_dim_);
  WriteBasicType(out_stream, binary, tmp_uint32);
  if (!binary) out_stream << "\n";  // readability of text-mode output

  if (Y_.size() != 0) {
    KALDI_ASSERT(gamma_.size() != 0);
    WriteToken(out_stream, binary, "<Y>");
    for (int32 i = 0; i < num_gaussians_; i++) {
      Y_[i].Write(out_stream, binary);
    }
  }
  if (Z_.size() != 0) {
    // Z_ and R_ are allocated together (speaker-projection stats).
    KALDI_ASSERT(R_.size() != 0);
    WriteToken(out_stream, binary, "<Z>");
    for (int32 i = 0; i < num_gaussians_; i++) {
      Z_[i].Write(out_stream, binary);
    }
    WriteToken(out_stream, binary, "<R>");
    for (int32 i = 0; i < num_gaussians_; i++) {
      R_[i].Write(out_stream, binary);
    }
  }
  if (S_.size() != 0) {
    KALDI_ASSERT(gamma_.size() != 0);
    WriteToken(out_stream, binary, "<S>");
    for (int32 i = 0; i < num_gaussians_; i++) {
      S_[i].Write(out_stream, binary);
    }
  }
  if (y_.size() != 0) {
    KALDI_ASSERT(gamma_.size() != 0);
    WriteToken(out_stream, binary, "<y>");
    for (int32 j = 0; j < num_states_; j++) {
      y_[j].Write(out_stream, binary);
    }
  }
  if (gamma_.size() != 0) {
    WriteToken(out_stream, binary, "<gamma>");
    for (int32 j = 0; j < num_states_; j++) {
      gamma_[j].Write(out_stream, binary);
    }
  }
  WriteToken(out_stream, binary, "<total_like>");
  WriteBasicType(out_stream, binary, total_like_);

  WriteToken(out_stream, binary, "<total_frames>");
  WriteBasicType(out_stream, binary, total_frames_);

  WriteToken(out_stream, binary, "</SGMMACCS>");
}
-
// Deserializes accumulators written by Write().  Reads the fixed dimension
// header, then loops over optional token-introduced sections until the
// closing tag.  If 'add' is true, stats are added to the existing contents
// (used when summing accumulators from parallel jobs) rather than replacing
// them.
void MleAmSgmmAccs::Read(std::istream &in_stream, bool binary,
                         bool add) {
  uint32 tmp_uint32;
  string token;

  ExpectToken(in_stream, binary, "<SGMMACCS>");

  ExpectToken(in_stream, binary, "<NUMSTATES>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  num_states_ = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<NUMGaussians>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  num_gaussians_ = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<FEATUREDIM>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  feature_dim_ = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<PHONESPACEDIM>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  phn_space_dim_ = static_cast<int32>(tmp_uint32);
  ExpectToken(in_stream, binary, "<SPKSPACEDIM>");
  ReadBasicType(in_stream, binary, &tmp_uint32);
  spk_space_dim_ = static_cast<int32>(tmp_uint32);

  ReadToken(in_stream, binary, &token);

  // Dispatch on section tokens; sections may be absent depending on the
  // flags the accumulator was created with.
  while (token != "</SGMMACCS>") {
    if (token == "<Y>") {
      Y_.resize(num_gaussians_);
      for (size_t i = 0; i < Y_.size(); i++) {
        Y_[i].Read(in_stream, binary, add);
      }
    } else if (token == "<Z>") {
      Z_.resize(num_gaussians_);
      for (size_t i = 0; i < Z_.size(); i++) {
        Z_[i].Read(in_stream, binary, add);
      }
    } else if (token == "<R>") {
      R_.resize(num_gaussians_);
      // gamma_s_ accompanies the speaker-projection stats; allocate it here
      // since it is not itself serialized (see comment below).
      if (gamma_s_.Dim() == 0) gamma_s_.Resize(num_gaussians_);
      for (size_t i = 0; i < R_.size(); i++) {
        R_[i].Read(in_stream, binary, add);
      }
    } else if (token == "<S>") {
      S_.resize(num_gaussians_);
      for (size_t i = 0; i < S_.size(); i++) {
        S_[i].Read(in_stream, binary, add);
      }
    } else if (token == "<y>") {
      y_.resize(num_states_);
      for (int32 j = 0; j < num_states_; j++) {
        y_[j].Read(in_stream, binary, add);
      }
    } else if (token == "<gamma>") {
      gamma_.resize(num_states_);
      for (int32 j = 0; j < num_states_; j++) {
        gamma_[j].Read(in_stream, binary, add);
      }
      // Don't read gamma_s, it's just a temporary variable and
      // not part of the permanent (non-speaker-specific) accs.
    } else if (token == "<total_like>") {
      double total_like;
      ReadBasicType(in_stream, binary, &total_like);
      if (add)
        total_like_ += total_like;
      else
        total_like_ = total_like;
    } else if (token == "<total_frames>") {
      double total_frames;
      ReadBasicType(in_stream, binary, &total_frames);
      if (add)
        total_frames_ += total_frames;
      else
        total_frames_ = total_frames;
    } else {
      KALDI_ERR << "Unexpected token '" << token << "' in model file ";
    }
    ReadToken(in_stream, binary, &token);
  }
}
-
// Consistency check: verifies that the accumulator dimensions match 'model'
// and that each allocated stats group has the expected shapes.  Also builds
// a diagnostic string saying which groups are present and whether they are
// (cheaply detectably) nonzero; logged when show_properties is true.
void MleAmSgmmAccs::Check(const AmSgmm &model,
                          bool show_properties) const {
  if (show_properties) {
    KALDI_LOG << "SgmmPdfModel: J = " << num_states_ << ", D = " <<
        feature_dim_ << ", S = " << phn_space_dim_ << ", T = " <<
        spk_space_dim_ << ", I = " << num_gaussians_;
  }
  KALDI_ASSERT(num_states_ == model.NumPdfs() && num_states_ > 0);
  KALDI_ASSERT(num_gaussians_ == model.NumGauss() && num_gaussians_ > 0);
  KALDI_ASSERT(feature_dim_ == model.FeatureDim() && feature_dim_ > 0);
  KALDI_ASSERT(phn_space_dim_ == model.PhoneSpaceDim() && phn_space_dim_ > 0);
  KALDI_ASSERT(spk_space_dim_ == model.SpkSpaceDim());

  std::ostringstream debug_str;

  if (Y_.size() == 0) {
    debug_str << "Y: no. ";
  } else {
    KALDI_ASSERT(gamma_.size() != 0);
    KALDI_ASSERT(Y_.size() == static_cast<size_t>(num_gaussians_));
    bool nz = false;
    for (int32 i = 0; i < num_gaussians_; i++) {
      KALDI_ASSERT(Y_[i].NumRows() == feature_dim_ &&
                   Y_[i].NumCols() == phn_space_dim_);
      // Only inspect element (0,0) -- a cheap heuristic for "nonzero".
      if (!nz && Y_[i](0, 0) != 0) { nz = true; }
    }
    debug_str << "Y: yes, " << string(nz ? "nonzero. " : "zero. ");
  }

  if (Z_.size() == 0) {
    // Z_ and R_ are allocated together, so both must be absent.
    KALDI_ASSERT(R_.size() == 0);
    debug_str << "Z, R: no. ";
  } else {
    KALDI_ASSERT(gamma_s_.Dim() == num_gaussians_);
    KALDI_ASSERT(Z_.size() == static_cast<size_t>(num_gaussians_));
    KALDI_ASSERT(R_.size() == static_cast<size_t>(num_gaussians_));
    bool Z_nz = false, R_nz = false;
    for (int32 i = 0; i < num_gaussians_; i++) {
      KALDI_ASSERT(Z_[i].NumRows() == feature_dim_ &&
                   Z_[i].NumCols() == spk_space_dim_);
      KALDI_ASSERT(R_[i].NumRows() == spk_space_dim_);
      if (!Z_nz && Z_[i](0, 0) != 0) { Z_nz = true; }
      if (!R_nz && R_[i](0, 0) != 0) { R_nz = true; }
    }
    bool gamma_s_nz = !gamma_s_.IsZero();
    debug_str << "Z: yes, " << string(Z_nz ? "nonzero. " : "zero. ");
    debug_str << "R: yes, " << string(R_nz ? "nonzero. " : "zero. ");
    debug_str << "gamma_s: yes, " << string(gamma_s_nz ? "nonzero. " : "zero. ");
  }

  if (S_.size() == 0) {
    debug_str << "S: no. ";
  } else {
    KALDI_ASSERT(gamma_.size() != 0);
    bool S_nz = false;
    KALDI_ASSERT(S_.size() == static_cast<size_t>(num_gaussians_));
    for (int32 i = 0; i < num_gaussians_; i++) {
      KALDI_ASSERT(S_[i].NumRows() == feature_dim_);
      if (!S_nz && S_[i](0, 0) != 0) { S_nz = true; }
    }
    debug_str << "S: yes, " << string(S_nz ? "nonzero. " : "zero. ");
  }

  if (y_.size() == 0) {
    debug_str << "y: no. ";
  } else {
    KALDI_ASSERT(gamma_.size() != 0);
    bool nz = false;
    KALDI_ASSERT(y_.size() == static_cast<size_t>(num_states_));
    for (int32 j = 0; j < num_states_; j++) {
      KALDI_ASSERT(y_[j].NumRows() == model.NumSubstates(j));
      KALDI_ASSERT(y_[j].NumCols() == phn_space_dim_);
      if (!nz && y_[j](0, 0) != 0) { nz = true; }
    }
    debug_str << "y: yes, " << string(nz ? "nonzero. " : "zero. ");
  }

  if (gamma_.size() == 0) {
    debug_str << "gamma: no. ";
  } else {
    debug_str << "gamma: yes. ";
    bool nz = false;
    KALDI_ASSERT(gamma_.size() == static_cast<size_t>(num_states_));
    for (int32 j = 0; j < num_states_; j++) {
      KALDI_ASSERT(gamma_[j].NumRows() == model.NumSubstates(j) &&
                   gamma_[j].NumCols() == num_gaussians_);
      // Just test the first substate for nonzero, else it would take too long.
      if (!nz && gamma_[j].Row(0).Norm(1.0) != 0) { nz = true; }
    }
    debug_str << "gamma: yes, " << string(nz ? "nonzero. " : "zero. ");
  }

  if (show_properties)
    KALDI_LOG << "Subspace GMM model properties: " << debug_str.str();
}
-
// (Re)allocates and zeroes the accumulator groups needed for the parameter
// types selected in 'flags', sized to 'model'; groups not selected are
// cleared so Write()/Check() treat them as absent.
void MleAmSgmmAccs::ResizeAccumulators(const AmSgmm &model,
                                       SgmmUpdateFlagsType flags) {
  num_states_ = model.NumPdfs();
  num_gaussians_ = model.NumGauss();
  feature_dim_ = model.FeatureDim();
  phn_space_dim_ = model.PhoneSpaceDim();
  spk_space_dim_ = model.SpkSpaceDim();

  // Y_ stats feed both the M update and the covariance update.
  if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) {
    Y_.resize(num_gaussians_);
    for (int32 i = 0; i < num_gaussians_; i++) {
      Y_[i].Resize(feature_dim_, phn_space_dim_);
    }
  } else {
    Y_.clear();
  }

  if (flags & kSgmmSpeakerProjections) {
    if (spk_space_dim_ == 0) {
      KALDI_ERR << "Cannot set up accumulators for speaker projections "
                << "because speaker subspace has not been set up";
    }
    gamma_s_.Resize(num_gaussians_);
    Z_.resize(num_gaussians_);
    R_.resize(num_gaussians_);
    for (int32 i = 0; i < num_gaussians_; i++) {
      Z_[i].Resize(feature_dim_, spk_space_dim_);
      R_[i].Resize(spk_space_dim_);
    }
  } else {
    gamma_s_.Resize(0);
    Z_.clear();
    R_.clear();
  }

  if (flags & kSgmmCovarianceMatrix) {
    S_.resize(num_gaussians_);
    for (int32 i = 0; i < num_gaussians_; i++) {
      S_[i].Resize(feature_dim_);
    }
  } else {
    S_.clear();
  }

  // Occupancies gamma_ are needed by almost every update type.
  if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections |
               kSgmmCovarianceMatrix | kSgmmSubstateWeights |
               kSgmmPhoneProjections)) {
    gamma_.resize(num_states_);
    total_frames_ = total_like_ = 0;
    for (int32 j = 0; j < num_states_; j++) {
      gamma_[j].Resize(model.NumSubstates(j), num_gaussians_);
    }
  } else {
    gamma_.clear();
    total_frames_ = total_like_ = 0;
  }

  if (flags & kSgmmPhoneVectors) {
    y_.resize(num_states_);
    for (int32 j = 0; j < num_states_; j++) {
      y_[j].Resize(model.NumSubstates(j), phn_space_dim_);
    }
  } else {
    y_.clear();
  }
}
-
-BaseFloat MleAmSgmmAccs::Accumulate(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- const VectorBase<BaseFloat> &v_s, // may be empty
- int32 j, BaseFloat weight,
- SgmmUpdateFlagsType flags) {
- // Calculate Gaussian posteriors and collect statistics
- Matrix<BaseFloat> posteriors;
- BaseFloat log_like = model.ComponentPosteriors(frame_vars, j, &posteriors);
- posteriors.Scale(weight);
- BaseFloat count = AccumulateFromPosteriors(model, frame_vars, posteriors,
- v_s, j, flags);
- // Note: total_frames_ is incremented in AccumulateFromPosteriors().
- total_like_ += count * log_like;
- return log_like;
-}
-
-
// Accumulates stats for one frame of pdf-state j given precomputed
// (already weight-scaled) component posteriors over the selected Gaussians.
// Which groups of stats get updated is controlled by 'flags'.  Posteriors
// are randomly pruned (RandPrune) to sparsify the accumulation.  Returns
// the total posterior count for this frame (after pruning).
BaseFloat MleAmSgmmAccs::AccumulateFromPosteriors(
    const AmSgmm &model,
    const SgmmPerFrameDerivedVars &frame_vars,
    const Matrix<BaseFloat> &posteriors,
    const VectorBase<BaseFloat> &v_s,  // may be empty
    int32 j,
    SgmmUpdateFlagsType flags) {
  double tot_count = 0.0;
  const vector<int32> &gselect = frame_vars.gselect;
  // Intermediate variables
  Vector<BaseFloat> gammat(gselect.size());  // per-Gaussian counts, for S_
  Vector<BaseFloat> xt_jmi(feature_dim_), mu_jmi(feature_dim_),
      zt_jmi(spk_space_dim_);

  int32 num_substates = model.NumSubstates(j);
  for (int32 ki = 0; ki < static_cast<int32>(gselect.size()); ki++) {
    int32 i = gselect[ki];  // ki indexes the selected subset; i is the Gaussian

    for (int32 m = 0; m < num_substates; m++) {
      // Eq. (39): gamma_{jmi}(t) = p (j, m, i|t)
      BaseFloat gammat_jmi = RandPrune(posteriors(ki, m), rand_prune_);

      // Accumulate statistics for non-zero gaussian posterior
      if (gammat_jmi != 0.0) {
        tot_count += gammat_jmi;
        if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections |
                     kSgmmCovarianceMatrix | kSgmmSubstateWeights |
                     kSgmmPhoneProjections)) {
          // Eq. (40): gamma_{jmi} = \sum_t gamma_{jmi}(t)
          gamma_[j](m, i) += gammat_jmi;
        }

        if (flags & kSgmmPhoneVectors) {
          // Eq. (41): y_{jm} = \sum_{t, i} \gamma_{jmi}(t) z_{i}(t)
          // Suggestion: move this out of the loop over m
          y_[j].Row(m).AddVec(gammat_jmi, frame_vars.zti.Row(ki));
        }

        if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) {
          // Eq. (42): Y_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{i}(t) v_{jm}^T
          Y_[i].AddVecVec(gammat_jmi, frame_vars.xti.Row(ki),
                          model.StateVectors(j).Row(m));
        }

        // Defer the rank-one scatter update to the separate loop below,
        // so AddVec2 is called once per Gaussian rather than per substate.
        if (flags & kSgmmCovarianceMatrix)
          gammat(ki) += gammat_jmi;

        // Accumulate for speaker projections
        if (flags & kSgmmSpeakerProjections) {
          KALDI_ASSERT(spk_space_dim_ > 0);
          // Eq. (43): x_{jmi}(t) = x_k(t) - M{i} v_{jm}
          model.GetSubstateMean(j, m, i, &mu_jmi);
          xt_jmi.CopyFromVec(frame_vars.xt);
          xt_jmi.AddVec(-1.0, mu_jmi);
          // Eq. (44): Z_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{jmi}(t) v^{s}'
          if (v_s.Dim() != 0)  // interpret empty v_s as zero.
            Z_[i].AddVecVec(gammat_jmi, xt_jmi, v_s);
          // Eq. (49): \gamma_{i}^{(s)} = \sum_{t\in\Tau(s), j, m} gamma_{jmi}
          // Will be used when you call CommitStatsForSpk(), to update R_.
          gamma_s_(i) += gammat_jmi;
        }
      } // non-zero posteriors
    } // loop over substates
  } // loop over selected Gaussians

  if (flags & kSgmmCovarianceMatrix) {
    for (int32 ki = 0; ki < static_cast<int32>(gselect.size()); ki++) {
      int32 i = gselect[ki];
      // Eq. (47): S_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{i}(t) x_{i}(t)^T
      if (gammat(ki) != 0.0)
        S_[i].AddVec2(gammat(ki), frame_vars.xti.Row(ki));
    }
  }
  total_frames_ += tot_count;
  return tot_count;
}
-
-void MleAmSgmmAccs::CommitStatsForSpk(const AmSgmm &model,
- const VectorBase<BaseFloat> &v_s) {
- if (v_s.Dim() != 0 && spk_space_dim_ > 0 && gamma_s_.Dim() != 0) {
- if (!v_s.IsZero())
- for (int32 i = 0; i < num_gaussians_; i++)
- // Accumulate Statistics R_{ki}
- if (gamma_s_(i) != 0.0)
- R_[i].AddVec2(static_cast<BaseFloat>(gamma_s_(i)), v_s);
- }
- gamma_s_.SetZero();
-}
-
-void MleAmSgmmAccs::GetStateOccupancies(Vector<BaseFloat> *occs) const {
- occs->Resize(gamma_.size());
- for (int32 j = 0, end = gamma_.size(); j < end; j++) {
- (*occs)(j) = gamma_[j].Sum();
- }
-}
-
-BaseFloat MleAmSgmmUpdater::Update(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- SgmmUpdateFlagsType flags) {
- KALDI_ASSERT((flags & (kSgmmPhoneVectors | kSgmmPhoneProjections |
- kSgmmPhoneWeightProjections | kSgmmCovarianceMatrix |
- kSgmmSubstateWeights | kSgmmSpeakerProjections)) != 0);
-
- if (flags & kSgmmPhoneProjections)
- ComputeQ(accs, *model, &Q_);
- if (flags & kSgmmCovarianceMatrix)
- ComputeSMeans(accs, *model, &S_means_);
-
- // quantities used in both vector and weights updates...
- vector< SpMatrix<double> > H;
- // "smoothing" matrices, weighted sums of above.
- SpMatrix<double> H_sm;
- Vector<double> y_sm; // "smoothing" vectors
- if ((flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections))
- || update_options_.renormalize_V) {
- model->ComputeH(&H);
- ComputeSmoothingTerms(accs, *model, H, &H_sm,
- (flags & kSgmmPhoneVectors) ? &y_sm : NULL);
- }
-
- BaseFloat tot_impr = 0.0;
-
- if (flags & kSgmmPhoneVectors) {
- if (update_options_.check_v) {
- KALDI_ASSERT(update_options_.tau_vec == 0 &&
- "You cannot combine the check-v and tau-vec options.");
- tot_impr += UpdatePhoneVectorsChecked(accs, model, H);
- } else {
- tot_impr += UpdatePhoneVectors(accs, model, H, H_sm, y_sm);
- }
- }
- if (flags & kSgmmPhoneProjections) {
- if (update_options_.tau_map_M > 0.0)
- tot_impr += MapUpdateM(accs, model); // MAP adaptation of M
- else
- tot_impr += UpdateM(accs, model);
- }
-
- if (flags & kSgmmPhoneWeightProjections) {
- if (update_options_.use_sequential_weight_update) {
- tot_impr += UpdateWSequential(accs, model);
- } else {
- tot_impr += UpdateWParallel(accs, model);
- }
- }
- if (flags & kSgmmCovarianceMatrix)
- tot_impr += UpdateVars(accs, model);
- if (flags & kSgmmSubstateWeights)
- tot_impr += UpdateSubstateWeights(accs, model);
- if (flags & kSgmmSpeakerProjections) {
- tot_impr += UpdateN(accs, model);
- if (update_options_.renormalize_N)
- RenormalizeN(accs, model); // if you renormalize N you have to
- // alter any speaker vectors you're keeping around, as well.
- }
-
- if (update_options_.renormalize_V)
- RenormalizeV(accs, model, H_sm);
-
- KALDI_LOG << "*Overall auxf improvement, combining all parameters, is "
- << tot_impr;
-
- KALDI_LOG << "***Overall data likelihood is "
- << (accs.total_like_/accs.total_frames_)
- << " over " << (accs.total_frames_) << " frames.";
-
- model->ComputeNormalizers(); // So that the model is ready to use.
- return tot_impr;
-}
-
-// Compute the Q_{i} (Eq. 64)
-void MleAmSgmmUpdater::ComputeQ(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- std::vector< SpMatrix<double> > *Q) {
- Q->resize(accs.num_gaussians_);
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- (*Q)[i].Resize(accs.phn_space_dim_);
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model.NumSubstates(j); m++) {
- if (accs.gamma_[j](m, i) > 0.0) {
- (*Q)[i].AddVec2(static_cast<BaseFloat>(accs.gamma_[j](m, i)),
- model.v_[j].Row(m));
- }
- }
- }
- }
-}
-
-// Compute the S_i^{(means)} quantities (Eq. 74).
-// Note: we seem to have also included in this variable
-// the term - (Y_i M_I^T + M_i Y_i^T).
-void MleAmSgmmUpdater::ComputeSMeans(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- std::vector< SpMatrix<double> > *S_means) {
- S_means->resize(accs.num_gaussians_);
- Matrix<double> YM_MY(accs.feature_dim_, accs.feature_dim_);
- Vector<BaseFloat> mu_jmi(accs.feature_dim_);
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- // YM_MY = - (Y_{i} M_{i}^T)
- YM_MY.AddMatMat(-1.0, accs.Y_[i], kNoTrans,
- Matrix<double>(model.M_[i]), kTrans, 0.0);
- // Add its own transpose: YM_MY = - (Y_{i} M_{i}^T + M_{i} Y_{i}^T)
- {
- Matrix<double> M(YM_MY, kTrans);
- YM_MY.AddMat(1.0, M);
- }
- (*S_means)[i].Resize(accs.feature_dim_, kUndefined);
- (*S_means)[i].CopyFromMat(YM_MY); // Sigma_{i} = -(YM' + MY')
-
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model.NumSubstates(j); m++) {
- if (accs.gamma_[j](m, i) != 0.0) {
- // Sigma_{i} += gamma_{jmi} * mu_{jmi}*mu_{jmi}^T
- mu_jmi.AddMatVec(1.0, model.M_[i], kNoTrans, model.v_[j].Row(m), 0.0);
- (*S_means)[i].AddVec2(static_cast<BaseFloat>(accs.gamma_[j](m, i)), mu_jmi);
- }
- }
- }
- KALDI_ASSERT(1.0 / (*S_means)[i](0, 0) != 0.0);
- }
-}
-
-// Compute H^{(sm)}, the "smoothing" matrices.
-void MleAmSgmmUpdater::ComputeSmoothingTerms(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const vector<SpMatrix<double> > &H,
- SpMatrix<double> *H_sm,
- Vector<double> *y_sm) const {
- KALDI_ASSERT(H_sm != NULL);
- H_sm->Resize(accs.phn_space_dim_);
- if (y_sm != NULL) y_sm->Resize(accs.phn_space_dim_);
- Vector<double> gamma_i(accs.num_gaussians_);
-
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0, end = model.NumSubstates(j); m < end; m++) {
- gamma_i.AddVec(1.0, accs.gamma_[j].Row(m));
- if (y_sm != NULL) (*y_sm).AddVec(1.0, accs.y_[j].Row(m));
- }
- }
-
- double sum = 0.0;
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- if (gamma_i(i) > 0) {
- H_sm->AddSp(gamma_i(i), H[i]);
- sum += gamma_i(i);
- }
- }
-
- if (sum == 0.0) {
- KALDI_WARN << "Sum of counts is zero. Smoothing matrix set to unit"
- << string((y_sm != NULL)? " & smoothing vector set to 0." : ".");
- H_sm->SetUnit(); // arbitrary non-singular matrix
- } else {
- if (y_sm != NULL) {
- (*y_sm).Scale(1.0 / sum);
- KALDI_VLOG(3) << "y_sm is " << (*y_sm);
- }
- H_sm->Scale(1.0 / sum);
- Matrix<double> H_sm_old(*H_sm);
- int32 tmp = H_sm->LimitCondDouble(update_options_.max_cond_H_sm);
- if (tmp > 0) {
- KALDI_WARN << "Limited " << tmp << " eigenvalues of H_sm.";
- if (update_options_.fixup_H_sm && y_sm != NULL) {
- Vector<double> avgVec(accs.phn_space_dim_);
- SpMatrix<double> HInv(H_sm_old);
- HInv.Invert();
- avgVec.AddSpVec(1.0, HInv, (*y_sm), 0.0);
- (*y_sm).AddSpVec(1.0, (*H_sm), avgVec, 0.0);
- KALDI_VLOG(3) << "y_sm [fixed up] is " << (*y_sm);
- }
- }
- }
-}
-
-
-class UpdatePhoneVectorsClass: public MultiThreadable { // For multi-threaded.
- public:
- UpdatePhoneVectorsClass(const MleAmSgmmUpdater &updater,
- const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- const SpMatrix<double> &H_sm,
- const Vector<double> &y_sm,
- double *auxf_impr,
- double *like_impr):
- updater_(updater), accs_(accs), model_(model),
- H_(H), H_sm_(H_sm), y_sm_(y_sm), auxf_impr_ptr_(auxf_impr),
- auxf_impr_(0.0), like_impr_ptr_(like_impr), like_impr_(0.0) { }
-
- ~UpdatePhoneVectorsClass() {
- *auxf_impr_ptr_ += auxf_impr_;
- *like_impr_ptr_ += like_impr_;
- }
-
- inline void operator() () {
- // Note: give them local copy of the sums we're computing,
- // which will be propagated to the total sums in the destructor.
- updater_.UpdatePhoneVectorsInternal(accs_, model_, H_, H_sm_, y_sm_,
- &auxf_impr_, &like_impr_,
- num_threads_, thread_id_);
- }
- private:
- const MleAmSgmmUpdater &updater_;
- const MleAmSgmmAccs &accs_;
- AmSgmm *model_;
- const std::vector<SpMatrix<double> > &H_;
- const SpMatrix<double> &H_sm_;
- const Vector<double> &y_sm_;
- double *auxf_impr_ptr_;
- double auxf_impr_;
- double *like_impr_ptr_;
- double like_impr_;
-};
-
-
-// Runs the phone vectors update for a subset of states (called
-// multi-threaded).
-void MleAmSgmmUpdater::UpdatePhoneVectorsInternal(
- const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- const SpMatrix<double> &H_sm,
- const Vector<double> &y_sm,
- double *auxf_impr,
- double *like_impr,
- int32 num_threads,
- int32 thread_id) const {
-
- int32 block_size = (accs.num_states_ + (num_threads-1)) / num_threads,
- j_start = block_size * thread_id,
- j_end = std::min(accs.num_states_, j_start + block_size);
-
- for (int32 j = j_start; j < j_end; j++) {
- double state_count = 0.0, state_auxf_impr = 0.0, state_like_impr = 0.0;
- Vector<double> w_jm(accs.num_gaussians_);
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- double gamma_jm = accs.gamma_[j].Row(m).Sum();
- state_count += gamma_jm;
- Vector<double> g_jm(accs.phn_space_dim_); // computed using eq. 58
- SpMatrix<double> H_jm(accs.phn_space_dim_); // computed using eq. 59
- // First compute normal H_jm.
-
- // need weights for this ...
- // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
- w_jm.AddMatVec(1.0, Matrix<double>(model->w_), kNoTrans,
- Vector<double>(model->v_[j].Row(m)), 0.0);
- w_jm.ApplySoftMax();
- g_jm.CopyFromVec(accs.y_[j].Row(m));
-
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- double gamma_jmi = accs.gamma_[j](m, i);
- double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i));
- double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term
- * VecVec(model->w_.Row(i), model->v_[j].Row(m));
- g_jm.AddVec(scalar, model->w_.Row(i));
- if (gamma_jmi != 0.0) {
- H_jm.AddSp(gamma_jmi, H[i]); // The most important term..
- }
- if (quadratic_term > 1.0e-10) {
- H_jm.AddVec2(static_cast<BaseFloat>(quadratic_term), model->w_.Row(i));
- }
- }
- SpMatrix<double> H_jm_dash(H_jm); // with ad-hoc smoothing term.
- Vector<double> g_jm_dash(g_jm); // with ad-hoc smoothing term.
-
- // H_jm_dash = H_jm + (smoothing term)
- H_jm_dash.AddSp(update_options_.tau_vec, H_sm);
- // g_jm_dash.BlasGemv(update_options_.mTauVec, H_sm, kNoTrans, e_1, 1.0);
- // g_jm_dash = g_jm + (smoothing term)
- g_jm_dash.AddVec(update_options_.tau_vec, y_sm);
-
- // if (gamma_jm == 0) continue;
- // no, we still want to update even with zero count.
-#ifdef KALDI_PARANOID
- if (update_options_.tau_vec > 0)
- KALDI_ASSERT(H_jm_dash.IsPosDef());
-#endif
- Vector<double> vhat_jm(model->v_[j].Row(m));
- SolverOptions opts;
- opts.name = "v";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
- double objf_impr_with_prior =
- SolveQuadraticProblem(H_jm_dash,
- g_jm_dash,
- opts,
- &vhat_jm);
-
- SpMatrix<BaseFloat> H_jm_flt(H_jm);
-
- double objf_impr_noprior =
- (VecVec(vhat_jm, g_jm)
- - 0.5 * VecSpVec(vhat_jm, H_jm, vhat_jm))
- - (VecVec(model->v_[j].Row(m), g_jm)
- - 0.5 * VecSpVec(model->v_[j].Row(m), H_jm_flt, model->v_[j].Row(m)));
- model->v_[j].Row(m).CopyFromVec(vhat_jm);
- if (j < 3 && m < 2 && thread_id == 0) {
- KALDI_LOG << "Objf impr for j = " << (j) << " m = " << (m) << " is "
- << (objf_impr_with_prior / (gamma_jm + 1.0e-20))
- << " (with ad-hoc prior) "
- << (objf_impr_noprior / (gamma_jm + 1.0e-20))
- << " (no prior) over " << (gamma_jm) << " frames";
- }
- state_auxf_impr += objf_impr_with_prior;
- state_like_impr += objf_impr_noprior;
- }
-
- *auxf_impr += state_auxf_impr;
- *like_impr += state_like_impr;
- if (j < 10 && thread_id == 0) {
- KALDI_LOG << "Objf impr for state j = " << (j) << " is "
- << (state_auxf_impr / (state_count + 1.0e-20))
- << " (with ad-hoc prior) "
- << (state_like_impr / (state_count + 1.0e-20))
- << " (no prior) over " << (state_count) << " frames";
- }
- }
-}
-
-double MleAmSgmmUpdater::UpdatePhoneVectors(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const vector< SpMatrix<double> > &H,
- const SpMatrix<double> &H_sm,
- const Vector<double> &y_sm) {
- KALDI_LOG << "Updating phone vectors";
-
- double count = 0.0, auxf_impr = 0.0, like_impr = 0.0; // sum over all states
-
- for (int32 j = 0; j < accs.num_states_; j++) count += accs.gamma_[j].Sum();
-
- UpdatePhoneVectorsClass c(*this, accs, model, H, H_sm, y_sm,
- &auxf_impr, &like_impr);
- RunMultiThreaded(c);
-
- auxf_impr /= (count + 1.0e-20);
- like_impr /= (count + 1.0e-20);
- KALDI_LOG << "**Overall objf impr for v is " << auxf_impr
- << "(with ad-hoc prior) " << like_impr << " (no prior) over "
- << (count) << " frames";
- // Choosing to return actual likelihood impr here.
- return like_impr;
-}
-
-
-/**
- This is as UpdatePhoneVectors but does not support smoothing terms or
- parallelization. However, it does compute the auxiliary function
- after doing the update, and backtracks if it did not increase (due
- to the weight terms, increase is not mathematically guaranteed). */
-
-double MleAmSgmmUpdater::UpdatePhoneVectorsChecked(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const vector< SpMatrix<double> > &H) {
- KALDI_LOG << "Updating phone vectors (and checking auxiliary function)";
-
- double tot_count = 0.0, tot_objf_impr = 0.0, tot_auxf_impr = 0.0; // sum over all states
-
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- double gamma_jm = accs.gamma_[j].Row(m).Sum();
- SpMatrix<double> X_jm(accs.phn_space_dim_); // = \sum_i \gamma_{jmi} H_i
-
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- double gamma_jmi = accs.gamma_[j](m, i);
- if (gamma_jmi != 0.0)
- X_jm.AddSp(gamma_jmi, H[i]);
- }
-
- Vector<double> v_jm_orig(model->v_[j].Row(m)),
- v_jm(v_jm_orig);
-
- double exact_objf_start = 0.0, exact_objf = 0.0, auxf_impr = 0.0;
- int32 backtrack_iter, max_backtrack = 10;
- for (backtrack_iter = 0; backtrack_iter < max_backtrack; backtrack_iter++) {
- // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
- Vector<double> w_jm(accs.num_gaussians_);
- w_jm.AddMatVec(1.0, Matrix<double>(model->w_), kNoTrans,
- v_jm, 0.0);
- w_jm.Add(-w_jm.LogSumExp()); // it is now log w_jm
-
- exact_objf = VecVec(w_jm, accs.gamma_[j].Row(m))
- + VecVec(v_jm, accs.y_[j].Row(m))
- -0.5 * VecSpVec(v_jm, X_jm, v_jm);
-
- if (backtrack_iter == 0.0) {
- exact_objf_start = exact_objf;
- } else {
- if (exact_objf >= exact_objf_start) {
- break; // terminate backtracking.
- } else {
- KALDI_LOG << "Backtracking computation of v_jm for j = " << j
- << " and m = " << m << " because objf changed by "
- << (exact_objf-exact_objf_start) << " [vs. predicted:] "
- << auxf_impr;
- v_jm.AddVec(1.0, v_jm_orig);
- v_jm.Scale(0.5);
- }
- }
-
- if (backtrack_iter == 0) { // computing updated value.
- w_jm.ApplyExp(); // it is now w_jm
- SpMatrix<double> H_jm(X_jm);
- Vector<double> g_jm(accs.y_[j].Row(m));
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- double gamma_jmi = accs.gamma_[j](m, i);
- double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i));
- double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term
- * VecVec(model->w_.Row(i), model->v_[j].Row(m));
- g_jm.AddVec(scalar, model->w_.Row(i));
- if (quadratic_term > 1.0e-10) {
- H_jm.AddVec2(static_cast<BaseFloat>(quadratic_term), model->w_.Row(i));
- }
- }
- SolverOptions opts;
- opts.name = "v";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
- auxf_impr = SolveQuadraticProblem(H_jm, g_jm, opts, &v_jm);
- }
- }
- double objf_impr = exact_objf - exact_objf_start;
- tot_count += gamma_jm;
- tot_objf_impr += objf_impr;
- tot_auxf_impr += auxf_impr;
- if (backtrack_iter == max_backtrack) {
- KALDI_WARN << "Backtracked " << max_backtrack << " times [not updating]";
- } else {
- model->v_[j].Row(m).CopyFromVec(v_jm);
- }
-
- if (j < 3 && m < 2) {
- KALDI_LOG << "Objf impr for j = " << (j) << " m = " << (m) << " is "
- << objf_impr << " vs. quadratic auxf impr (before backtrack) "
- << auxf_impr;
- }
- }
- }
-
- tot_objf_impr /= (tot_count + 1.0e-20);
- tot_auxf_impr /= (tot_count + 1.0e-20);
- KALDI_LOG << "**Overall objf impr for v is " << tot_objf_impr
- << " (auxf impr before backtracking:) " << tot_auxf_impr
- << " over " << tot_count << " frames";
- // Choosing to return actual likelihood impr here.
- return tot_objf_impr;
-}
-
-
-
-class UpdatePhoneVectorsCheckedFromClusterableClass: public MultiThreadable { // For multi-threaded.
- public:
- UpdatePhoneVectorsCheckedFromClusterableClass(
- MleAmSgmmUpdater *updater,
- const std::vector<SgmmClusterable*> &stats,
- const std::vector<SpMatrix<double> > &H,
- AmSgmm *model,
- double *count,
- double *like_impr):
- updater_(updater), stats_(stats), H_(H), model_(model),
- count_ptr_(count), count_(0.0),
- like_impr_ptr_(like_impr), like_impr_(0.0)
- { }
-
- ~UpdatePhoneVectorsCheckedFromClusterableClass() {
- *count_ptr_ += count_;
- *like_impr_ptr_ += like_impr_;
- }
-
- inline void operator() () {
- // Note: give them local copy of the sums we're computing,
- // which will be propagated to the total sums in the destructor.
- updater_->UpdatePhoneVectorsCheckedFromClusterableInternal(
- stats_, H_, model_, &count_, &like_impr_, num_threads_, thread_id_);
- }
- private:
- MleAmSgmmUpdater *updater_;
- const std::vector<SgmmClusterable*> &stats_;
- const std::vector<SpMatrix<double> > &H_;
- AmSgmm *model_;
- double *count_ptr_;
- double count_;
- double *like_impr_ptr_;
- double like_impr_;
-};
-
-
-double MleAmSgmmUpdater::UpdatePhoneVectorsCheckedFromClusterable(
- const std::vector<SgmmClusterable*> &stats,
- const vector< SpMatrix<double> > &H,
- AmSgmm *model) {
- KALDI_LOG << "Updating phone vectors using stats from Clusterable class "
- "(and checking auxiliary function)";
- double count = 0.0, like_impr = 0.0;
-
- UpdatePhoneVectorsCheckedFromClusterableClass c(this, stats, H, model,
- &count, &like_impr);
- RunMultiThreaded(c);
-
- KALDI_LOG << "**Overall objf impr for v is " << (like_impr / count)
- << " over " << count << " frames.";
-
- return like_impr / count;
-}
-
-
-void MleAmSgmmUpdater::UpdatePhoneVectorsCheckedFromClusterableInternal(
- const std::vector<SgmmClusterable*> &stats,
- const vector< SpMatrix<double> > &H,
- AmSgmm *model,
- double *count_ptr,
- double *like_impr_ptr,
- int32 num_threads,
- int32 thread_id) {
-
- int32 block_size = (model->NumPdfs() + (num_threads-1)) / num_threads,
- j_start = block_size * thread_id,
- j_end = std::min(model->NumPdfs(), j_start + block_size);
-
- double tot_count = 0.0, tot_objf_impr = 0.0, tot_auxf_impr = 0.0; // sum over all states
-
- KALDI_ASSERT(model->NumPdfs() == static_cast<int32>(stats.size()));
- int32 num_gauss = model->NumGauss();
- for (int32 j = j_start; j < j_end; j++) {
- KALDI_ASSERT(model->NumSubstates(j) == 1 &&
- "This function only works if there is 1 substate per state.");
- int32 m = 0; // sub-state index.
- const Vector<double> &gamma = stats[j]->gamma();
- const Vector<double> &y = stats[j]->y();
-
- double gamma_jm = gamma.Sum();
- SpMatrix<double> X_jm(model->PhoneSpaceDim()); // = \sum_i \gamma_{jmi} H_i
-
- for (int32 i = 0; i < num_gauss; i++) {
- double gamma_jmi = gamma(i);
- if (gamma_jmi != 0.0)
- X_jm.AddSp(gamma_jmi, H[i]);
- }
-
- Vector<double> v_jm_orig(model->v_[j].Row(m)),
- v_jm(v_jm_orig);
-
- double exact_objf_start = 0.0, exact_objf = 0.0, auxf_impr = 0.0;
- int32 backtrack_iter, max_backtrack = 10;
- for (backtrack_iter = 0; backtrack_iter < max_backtrack; backtrack_iter++) {
- // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
- Vector<double> w_jm(num_gauss);
- w_jm.AddMatVec(1.0, Matrix<double>(model->w_), kNoTrans,
- v_jm, 0.0);
- w_jm.Add(-w_jm.LogSumExp()); // it is now log w_jm
-
- exact_objf = VecVec(w_jm, gamma)
- + VecVec(v_jm, y)
- -0.5 * VecSpVec(v_jm, X_jm, v_jm);
-
- if (backtrack_iter == 0.0) {
- exact_objf_start = exact_objf;
- } else {
- if (exact_objf >= exact_objf_start) {
- break; // terminate backtracking.
- } else {
- KALDI_LOG << "Backtracking computation of v_jm for j = " << j
- << " and m = " << m << " because objf changed by "
- << (exact_objf-exact_objf_start) << " [vs. predicted:] "
- << auxf_impr;
- v_jm.AddVec(1.0, v_jm_orig);
- v_jm.Scale(0.5);
- }
- }
-
- if (backtrack_iter == 0) { // computing updated value.
- w_jm.ApplyExp(); // it is now w_jm
- SpMatrix<double> weight_2nd_deriv(model->PhoneSpaceDim()); // actually
- // negatived 2nd derivative.
- Vector<double> num_deriv(model->PhoneSpaceDim());
- Vector<double> den_deriv(model->PhoneSpaceDim());
-
- // We modify the optimization to use the exact 2nd derivative.
- // Because we do checking and backtracking, the loss of
- // natural stability is OK.
- for (int32 i = 0; i < num_gauss; i++) {
- double gamma_jmi = gamma(i);
- SubVector<BaseFloat> wi(model->w_, i);
- num_deriv.AddVec(gamma_jmi, wi);
- double scalar = gamma_jm * w_jm(i); // expected count.
- den_deriv.AddVec(scalar, wi);
- if (scalar > 1.0e-10) // if-statement is a speedup
- weight_2nd_deriv.AddVec2(static_cast<BaseFloat>(scalar), wi);
- }
- Vector<double> total_linear_term(y);
- total_linear_term.AddVec(1.0, num_deriv);
- total_linear_term.AddVec(-1.0, den_deriv);
- if (gamma_jm > 0.0)
- weight_2nd_deriv.AddVec2(-1.0/gamma_jm, den_deriv);
-
- total_linear_term.AddSpVec(1.0, weight_2nd_deriv, v_jm, 1.0);
- // we want the derivatives around zero, not around the current point.
- // Correction for this.
-
- SpMatrix<double> total_quadratic_term(weight_2nd_deriv);
- total_quadratic_term.AddSp(1.0, X_jm);
-
- SolverOptions opts;
- opts.name = "v";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
- auxf_impr = SolveQuadraticProblem(total_quadratic_term,
- total_linear_term, opts, &v_jm);
- }
- }
- double objf_impr = exact_objf - exact_objf_start;
- tot_count += gamma_jm;
- tot_objf_impr += objf_impr;
- tot_auxf_impr += auxf_impr;
- if (backtrack_iter == max_backtrack) {
- KALDI_WARN << "Backtracked " << max_backtrack << " times [not updating]";
- } else {
- model->v_[j].Row(m).CopyFromVec(v_jm);
- }
- if (j < 3) {
- KALDI_LOG << "Objf impr for j = " << (j) << " m = " << (m) << " is "
- << objf_impr << " vs. quadratic auxf impr (before backtrack) "
- << auxf_impr;
- }
- }
-
- *like_impr_ptr = tot_objf_impr;
- *count_ptr = tot_count;
-
- tot_objf_impr /= (tot_count + 1.0e-20);
- tot_auxf_impr /= (tot_count + 1.0e-20);
-
- if (j_start == 0)
- KALDI_LOG << "**For first batch: objf impr for v is " << tot_objf_impr
- << " (auxf impr before backtracking:) " << tot_auxf_impr
- << " over " << tot_count << " frames";
-}
-
-
-void MleAmSgmmUpdater::RenormalizeV(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const SpMatrix<double> &H_sm) {
- SpMatrix<double> Sigma(accs.phn_space_dim_);
- int32 count = 0;
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- count++;
- Sigma.AddVec2(static_cast<BaseFloat>(1.0), model->v_[j].Row(m));
- }
- }
- Sigma.Scale(1.0 / count);
- int32 fixed_eigs = Sigma.LimitCondDouble(update_options_.max_cond);
- if (fixed_eigs != 0) {
- KALDI_WARN << "Scatter of vectors v is poorly conditioned. Fixed up "
- << fixed_eigs << " eigenvalues.";
- }
- KALDI_LOG << "Eigenvalues of scatter of vectors v is : ";
- Sigma.PrintEigs("Sigma");
- if (!Sigma.IsPosDef()) {
- KALDI_LOG << "Not renormalizing v because scatter is not positive definite"
- << " -- maybe first iter?";
- return;
- }
-
- // Want to make variance of v unit and H_sm (like precision matrix) diagonal.
- TpMatrix<double> L(accs.phn_space_dim_);
- L.Cholesky(Sigma);
- TpMatrix<double> LInv(L);
- LInv.Invert();
-
- Matrix<double> tmpL(accs.phn_space_dim_, accs.phn_space_dim_);
- tmpL.CopyFromTp(L);
-
- SpMatrix<double> H_sm_proj(accs.phn_space_dim_);
- H_sm_proj.AddMat2Sp(1.0, tmpL, kTrans, H_sm, 0.0);
- // H_sm_proj := L^{T} * H_sm * L.
- // This is right because we would transform the vectors themselves
- // by L^{-1}, and H_sm is like the inverse of the vectors,
- // so it's {L^{-1}}^{-T} = L^T.
-
- Matrix<double> U(accs.phn_space_dim_, accs.phn_space_dim_);
- Vector<double> eigs(accs.phn_space_dim_);
- H_sm_proj.SymPosSemiDefEig(&eigs, &U, 1.0); // 1.0 means no checking +ve def -> faster
- KALDI_LOG << "Note on the next diagnostic: the first number is generally not "
- << "that meaningful as it relates to the static offset";
- H_sm_proj.PrintEigs("H_sm_proj (Significance of dims in vector space.. note)");
-
- // Transform on vectors is U^T L^{-1}.
- // Why? Because transform on H_sm is T =U^T L^T
- // and we want T^{-T} by normal rules of vector/covector and we
- // have (U^T L^T)^{-T} = (L U)^{-1} = U^T L^{-1}.
- Matrix<double> Trans(accs.phn_space_dim_, accs.phn_space_dim_); // T^{-T}
- Matrix<double> tmpLInv(accs.phn_space_dim_, accs.phn_space_dim_);
- tmpLInv.CopyFromTp(LInv);
- Trans.AddMatMat(1.0, U, kTrans, tmpLInv, kNoTrans, 0.0);
- Matrix<double> TransInv(Trans);
- TransInv.Invert(); // T in above...
-
-#ifdef KALDI_PARANOID
- {
- SpMatrix<double> H_sm_tmp(accs.phn_space_dim_);
- H_sm_tmp.AddMat2Sp(1.0, TransInv, kTrans, H_sm, 0.0);
- KALDI_ASSERT(H_sm_tmp.IsDiagonal(0.1));
- }
- {
- SpMatrix<double> Sigma_tmp(accs.phn_space_dim_);
- Sigma_tmp.AddMat2Sp(1.0, Trans, kNoTrans, Sigma, 0.0);
- KALDI_ASSERT(Sigma_tmp.IsUnit(0.1));
- }
-#endif
-
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- Vector<double> tmp(accs.phn_space_dim_);
- tmp.AddMatVec(1.0, Trans, kNoTrans, Vector<double>(model->v_[j].Row(m)), 0.0);
- model->v_[j].Row(m).CopyFromVec(tmp);
- }
- }
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- Vector<double> tmp(accs.phn_space_dim_);
- tmp.AddMatVec(1.0, TransInv, kTrans, Vector<double>(model->w_.Row(i)), 0.0);
- model->w_.Row(i).CopyFromVec(tmp);
-
- Matrix<double> tmpM(accs.feature_dim_, accs.phn_space_dim_);
- // Multiplying on right not left so must not transpose TransInv.
- tmpM.AddMatMat(1.0, Matrix<double>(model->M_[i]), kNoTrans,
- TransInv, kNoTrans, 0.0);
- model->M_[i].CopyFromMat(tmpM);
- }
- KALDI_LOG << "Renormalized subspace.";
-}
-
-double MleAmSgmmUpdater::UpdateM(const MleAmSgmmAccs &accs,
- AmSgmm *model) {
- double tot_count = 0.0, tot_like_impr = 0.0;
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- double gamma_i = 0.0;
- for (int32 j = 0; j < accs.num_states_; j++)
- for (int32 m = 0; m < model->NumSubstates(j); m++)
- gamma_i += accs.gamma_[j](m, i);
-
- if (gamma_i < accs.feature_dim_) {
- KALDI_WARN << "For component " << i << ": not updating M due to very "
- << "small count (=" << gamma_i << ").";
- continue;
- }
-
- SolverOptions opts;
- opts.name = "M";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- Matrix<double> Mi(model->M_[i]);
- double impr = SolveQuadraticMatrixProblem(Q_[i], accs.Y_[i],
- SpMatrix<double>(model->SigmaInv_[i]),
- opts, &Mi);
- model->M_[i].CopyFromMat(Mi);
-
- if (i < 10) {
- KALDI_VLOG(2) << "Objf impr for projection M for i = " << i << ", is "
- << (impr/(gamma_i + 1.0e-20)) << " over " << gamma_i
- << " frames";
- }
- tot_count += gamma_i;
- tot_like_impr += impr;
- }
- tot_like_impr /= (tot_count + 1.0e-20);
- KALDI_LOG << "Overall objective function improvement for model projections "
- << "M is " << tot_like_impr << " over " << tot_count << " frames";
- return tot_like_impr;
-}
-
-// Estimate the parameters of a Gaussian prior over the M matrices. There are
-// as many mean matrices as UBM size and two covariance matrices for the rows
-// of M and columns of M. The prior means M_i are fixed to the unadapted values.
-// This is what was done in Lu, et al. "Maximum a posteriori adaptation of
-// subspace Gaussian mixture models for cross-lingual speech recognition",
-// ICASSP 2012.
-void MleAmSgmmUpdater::ComputeMPrior(AmSgmm *model) {
- KALDI_ASSERT(update_options_.map_M_prior_iters > 0);
- int32 Ddim = model->FeatureDim();
- int32 Sdim = model->PhoneSpaceDim();
- int32 nGaussians = model->NumGauss();
-
- // inverse variance of the columns of M: dim is # of rows
- model->col_cov_inv_.Resize(Ddim);
- // inverse covariance of the rows of M: dim is # of columns
- model->row_cov_inv_.Resize(Sdim);
-
- model->col_cov_inv_.SetUnit();
- model->row_cov_inv_.SetUnit();
-
- if (model->M_prior_.size() == 0) {
- model->M_prior_.resize(nGaussians);
- for (int32 i = 0; i < nGaussians; i++) {
- model->M_prior_[i].Resize(Ddim, Sdim);
- model->M_prior_[i].CopyFromMat(model->M_[i]); // We initialize Mpri as this
- }
- }
-
- if (update_options_.full_col_cov || update_options_.full_row_cov) {
- Matrix<double> avg_M(Ddim, Sdim); // average of the Gaussian prior means
- for (int32 i = 0; i < nGaussians; i++)
- avg_M.AddMat(1.0, Matrix<double>(model->M_prior_[i]));
- avg_M.Scale(1.0 / nGaussians);
-
- Matrix<double> MDiff(Ddim, Sdim);
- for (int32 iter = 0; iter < update_options_.map_M_prior_iters; iter++) {
- { // diagnostic block.
- double prior_like = -0.5 * nGaussians * (Ddim * Sdim * Log(2 * M_PI)
- + Sdim * (-model->row_cov_inv_.LogPosDefDet())
- + Ddim * (-model->col_cov_inv_.LogPosDefDet()));
- for (int32 i = 0; i < nGaussians; i++) {
- MDiff.CopyFromMat(Matrix<double>(model->M_prior_[i]));
- MDiff.AddMat(-1.0, avg_M); // MDiff = M_{i} - avg(M)
- SpMatrix<double> tmp(Ddim);
- // tmp = MDiff.Omega_r^{-1}*MDiff^T.
- tmp.AddMat2Sp(1.0, MDiff, kNoTrans,
- SpMatrix<double>(model->row_cov_inv_), 0.0);
- prior_like -= 0.5 * TraceSpSp(tmp, SpMatrix<double>(model->col_cov_inv_));
- }
- KALDI_LOG << "Before iteration " << iter
- << " of updating prior over M, log like per dimension modeled is "
- << prior_like / (nGaussians * Ddim * Sdim);
- }
-
- // First estimate the column covariances (\Omega_r in paper)
- if (update_options_.full_col_cov) {
- size_t limited;
- model->col_cov_inv_.SetZero();
- for (int32 i = 0; i < nGaussians; i++) {
- MDiff.CopyFromMat(Matrix<double>(model->M_prior_[i]));
- MDiff.AddMat(-1.0, avg_M); // MDiff = M_{i} - avg(M)
- // Omega_r += 1/(D*I) * Mdiff * Omega_c^{-1} * Mdiff^T
- model->col_cov_inv_.AddMat2Sp(1.0 / (Ddim * nGaussians),
- Matrix<BaseFloat>(MDiff), kNoTrans,
- model->row_cov_inv_, 1.0);
- }
- model->col_cov_inv_.PrintEigs("col_cov");
- limited = model->col_cov_inv_.LimitCond(update_options_.max_cond,
- true /*invert the matrix*/);
- if (limited != 0) {
- KALDI_LOG << "Computing column covariances for M: limited " << limited
- << " singular values, max condition is "
- << update_options_.max_cond;
- }
- }
-
- // Now estimate the row covariances (\Omega_c in paper)
- if (update_options_.full_row_cov) {
- size_t limited;
- model->row_cov_inv_.SetZero();
- for (int32 i = 0; i < nGaussians; i++) {
- MDiff.CopyFromMat(Matrix<double>(model->M_prior_[i]));
- MDiff.AddMat(-1.0, avg_M); // MDiff = M_{i} - avg(M)
- // Omega_c += 1/(S*I) * Mdiff^T * Omega_r^{-1} * Mdiff.
- model->row_cov_inv_.AddMat2Sp(1.0 / (Sdim * nGaussians),
- Matrix<BaseFloat>(MDiff), kTrans,
- model->col_cov_inv_, 1.0);
- }
- model->row_cov_inv_.PrintEigs("row_cov");
- limited = model->row_cov_inv_.LimitCond(update_options_.max_cond,
- true /*invert the matrix*/);
- if (limited != 0) {
- KALDI_LOG << "Computing row covariances for M: limited " << limited
- << " singular values, max condition is "
- << update_options_.max_cond;
- }
- }
- } // end iterations
- }
-}
-
-
-// MAP adaptation of M with a matrix-variate Gaussian prior
-double MleAmSgmmUpdater::MapUpdateM(const MleAmSgmmAccs &accs, AmSgmm *model) {
- int32 Ddim = model->FeatureDim();
- int32 Sdim = model->PhoneSpaceDim();
- int32 nGaussians = model->NumGauss();
-
- KALDI_LOG << "Prior smoothing parameter: Tau = " << update_options_.tau_map_M;
- if (model->M_prior_.size() == 0 || model->col_cov_inv_.NumRows() == 0
- || model->row_cov_inv_.NumRows() == 0) {
- KALDI_LOG << "Computing the prior first";
- ComputeMPrior(model);
- }
-
- Matrix<double> G(Ddim, Sdim);
- // \tau \Omega_c^{-1} avg(M) \Omega_r^{-1}, depends on Gaussian index
- Matrix<double> prior_term_i(Ddim, Sdim);
- SpMatrix<double> P2(model->col_cov_inv_);
- SpMatrix<double> Q2(model->row_cov_inv_);
- Q2.Scale(update_options_.tau_map_M);
-
- double totcount = 0.0, tot_like_impr = 0.0;
- for (int32 i = 0; i < nGaussians; ++i) {
- double gamma_i = 0.0;
- for (int32 j = 0; j < accs.num_states_; ++j)
- for (int32 m = 0; m < model->NumSubstates(j); ++m)
- gamma_i += accs.gamma_[j](m, i);
-
- if (gamma_i < accs.feature_dim_) {
- KALDI_WARN << "For component " << i << ": not updating M due to very "
- << "small count (=" << gamma_i << ").";
- continue;
- }
-
- Matrix<double> tmp(Ddim, Sdim, kSetZero);
- tmp.AddSpMat(1.0, SpMatrix<double>(model->col_cov_inv_),
- Matrix<double>(model->M_prior_[i]), kNoTrans, 0.0);
- prior_term_i.AddMatSp(update_options_.tau_map_M, tmp, kNoTrans,
- SpMatrix<double>(model->row_cov_inv_), 0.0);
-
- Matrix<double> SigmaY(Ddim, Sdim, kSetZero);
- SigmaY.AddSpMat(1.0, SpMatrix<double>(model->SigmaInv_[i]), accs.Y_[i],
- kNoTrans, 0.0);
- G.CopyFromMat(SigmaY); // G = \Sigma_{i}^{-1} Y_{i}
- G.AddMat(1.0, prior_term_i); // G += \tau \Omega_c^{-1} avg(M) \Omega_r^{-1}
- SpMatrix<double> P1(model->SigmaInv_[i]);
- Matrix<double> Mi(model->M_[i]);
-
- SolverOptions opts;
- opts.name = "M";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- double impr = SolveDoubleQuadraticMatrixProblem(G, P1, P2, Q_[i], Q2, opts, &Mi);
- model->M_[i].CopyFromMat(Mi);
- if (i < 10) {
- KALDI_LOG << "Objf impr for projection M for i = " << i << ", is "
- << (impr / (gamma_i + 1.0e-20)) << " over " << gamma_i
- << " frames";
- }
- totcount += gamma_i;
- tot_like_impr += impr;
- }
- tot_like_impr /= (totcount + 1.0e-20);
- KALDI_LOG << "Overall objective function improvement for model projections "
- << "M is " << tot_like_impr << " over " << totcount << " frames";
- return tot_like_impr;
-}
-
-
-/// This function gets stats used inside UpdateWParallel, where it accumulates
-/// the F_i and g_i quantities. Note: F_i is viewed as a vector of SpMatrix
-/// (one for each i); each row of F_i is viewed as an SpMatrix even though
-/// it's stored as a vector....
-/// Note: w is just a double-precision copy of the matrix model->w_
-
-// static
-void MleAmSgmmUpdater::UpdateWParallelGetStats(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const Matrix<double> &w,
- Matrix<double> *F_i,
- Matrix<double> *g_i,
- double *tot_like,
- int32 num_threads,
- int32 thread_id) {
-
- // Accumulate stats from a block of states (this gets called in parallel).
- int32 block_size = (accs.num_states_ + (num_threads-1)) / num_threads,
- j_start = block_size * thread_id,
- j_end = std::min(accs.num_states_, j_start + block_size);
-
- // Unlike in the report the inner most loop is over Gaussians, where
- // per-gaussian statistics are accumulated. This is more memory demanding
- // but more computationally efficient, as outer product v_{jvm} v_{jvm}^T
- // is computed only once for all gaussians.
-
- SpMatrix<double> v_vT(accs.phn_space_dim_);
-
- for (int32 j = j_start; j < j_end; j++) {
- int32 num_substates = model.NumSubstates(j);
- Matrix<double> w_jm(num_substates, accs.num_gaussians_);
- // The linear term and quadratic term for each Gaussian-- two scalars
- // for each Gaussian, they appear in the accumulation formulas.
- Matrix<double> linear_term(num_substates, accs.num_gaussians_);
- Matrix<double> quadratic_term(num_substates, accs.num_gaussians_);
- Matrix<double> v_vT_m(num_substates,
- (accs.phn_space_dim_*(accs.phn_space_dim_+1))/2);
-
- // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7)
- Matrix<double> v_j_double(model.v_[j]);
- w_jm.AddMatMat(1.0, v_j_double, kNoTrans, w, kTrans, 0.0);
-
- for (int32 m = 0; m < model.NumSubstates(j); m++) {
- double gamma_jm = accs.gamma_[j].Row(m).Sum();
-
- w_jm.Row(m).Add(-1.0 * w_jm.Row(m).LogSumExp());
- *tot_like += VecVec(w_jm.Row(m), accs.gamma_[j].Row(m));
- w_jm.Row(m).ApplyExp();
- v_vT.SetZero();
- // v_vT := v_{jkm} v_{jkm}^T
- v_vT.AddVec2(static_cast<BaseFloat>(1.0), v_j_double.Row(m));
- v_vT_m.Row(m).CopyFromPacked(v_vT); // a bit wasteful, but does not dominate.
-
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- // Suggestion: g_jkm can be computed more efficiently
- // using the Vector/Matrix routines for all i at once
- // linear term around cur value.
- linear_term(m, i) = accs.gamma_[j](m, i) - gamma_jm * w_jm(m, i);
- quadratic_term(m, i) = std::max(accs.gamma_[j](m, i),
- gamma_jm * w_jm(m, i));
- }
- } // loop over substates
- g_i->AddMatMat(1.0, linear_term, kTrans, v_j_double, kNoTrans, 1.0);
- F_i->AddMatMat(1.0, quadratic_term, kTrans, v_vT_m, kNoTrans, 1.0);
- } // loop over states
-}
-
-// The parallel weight update, in the paper.
-double MleAmSgmmUpdater::UpdateWParallel(const MleAmSgmmAccs &accs,
- AmSgmm *model) {
- KALDI_LOG << "Updating weight projections";
-
- // tot_like_{after, before} are totals over multiple iterations,
- // not valid likelihoods. but difference is valid (when divided by tot_count).
- double tot_predicted_like_impr = 0.0, tot_like_before = 0.0,
- tot_like_after = 0.0;
-
- Matrix<double> g_i(accs.num_gaussians_, accs.phn_space_dim_);
- // View F_i as a vector of SpMatrix.
- Matrix<double> F_i(accs.num_gaussians_,
- (accs.phn_space_dim_*(accs.phn_space_dim_+1))/2);
-
- Matrix<double> w(model->w_);
- double tot_count = 0.0;
- for (int32 j = 0; j < accs.num_states_; j++) tot_count += accs.gamma_[j].Sum();
-
- for (int iter = 0; iter < update_options_.weight_projections_iters; iter++) {
- F_i.SetZero();
- g_i.SetZero();
- double k_like_before = 0.0;
-
- UpdateWParallelClass c(accs, *model, w, &F_i, &g_i, &k_like_before);
- RunMultiThreaded(c);
-
- Matrix<double> w_orig(w);
- double k_predicted_like_impr = 0.0, k_like_after = 0.0;
- double min_step = 0.001, step_size;
- for (step_size = 1.0; step_size >= min_step; step_size /= 2) {
- k_predicted_like_impr = 0.0;
- k_like_after = 0.0;
-
- SolverOptions opts;
- opts.name = "w";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- // auxf is formulated in terms of change in w.
- Vector<double> delta_w(accs.phn_space_dim_);
- // returns objf impr with step_size = 1,
- // but it may not be 1 so we recalculate it.
- SpMatrix<double> this_F_i(accs.phn_space_dim_);
- this_F_i.CopyFromVec(F_i.Row(i));
- SolveQuadraticProblem(this_F_i, g_i.Row(i), opts, &delta_w);
-
- delta_w.Scale(step_size);
- double predicted_impr = VecVec(delta_w, g_i.Row(i)) -
- 0.5 * VecSpVec(delta_w, this_F_i, delta_w);
-
- // should never be negative because
- // we checked inside SolveQuadraticProblem.
- KALDI_ASSERT(predicted_impr >= -1.0e-05);
-
- if (i < 10) {
- KALDI_LOG << "Predicted objf impr for w (not per frame), iter = " <<
- (iter) << ", i = " << (i) << " is " << (predicted_impr);
- }
- k_predicted_like_impr += predicted_impr;
- w.Row(i).AddVec(1.0, delta_w);
- }
- Vector<double> w_jm_vec(accs.num_gaussians_);
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- w_jm_vec.AddMatVec(1.0, w, kNoTrans, Vector<double>(model->v_[j].Row(m)), 0.0);
- w_jm_vec.Add((-1.0) * w_jm_vec.LogSumExp());
- k_like_after += VecVec(w_jm_vec, accs.gamma_[j].Row(m));
- }
- }
- KALDI_VLOG(2) << "For iteration " << (iter) << ", updating w gives "
- << "predicted per-frame like impr "
- << (k_predicted_like_impr / tot_count) << ", actual "
- << ((k_like_after - k_like_before) / tot_count) << ", over "
- << (tot_count) << " frames";
- if (k_like_after < k_like_before) {
- w.CopyFromMat(w_orig); // Undo what we computed.
- if (fabs(k_like_after - k_like_before) / tot_count < 1.0e-05) {
- k_like_after = k_like_before;
- KALDI_WARN << "Not updating weights as not increasing auxf and "
- << "probably due to numerical issues (since small change).";
- break;
- } else {
- KALDI_WARN << "Halving step size for weights as likelihood did "
- << "not increase";
- }
- } else {
- break;
- }
- }
- if (step_size < min_step) {
- // Undo any step as we have no confidence that this is right.
- w.CopyFromMat(w_orig);
- } else {
- tot_predicted_like_impr += k_predicted_like_impr;
- tot_like_after += k_like_after;
- tot_like_before += k_like_before;
- }
- }
-
- model->w_.CopyFromMat(w);
-
- tot_predicted_like_impr /= tot_count;
- tot_like_after = (tot_like_after - tot_like_before) / tot_count;
- KALDI_LOG << "**Overall objf impr for w is " << tot_predicted_like_impr
- << ", actual " << tot_like_after << ", over "
- << tot_count << " frames";
- return tot_like_after;
-}
-
-double MleAmSgmmUpdater::UpdateWSequential(
- const MleAmSgmmAccs &accs, AmSgmm *model) {
- // Sequential version, in paper.
- /* This is the approach for the weight projections that
- * I originally implemented, in which we test the auxiliary function
- improvement for each i that we update. This requires some
- careful bookkeeping. It means that we need to store the
- total of the un-normalized weights for each j, m. */
-
- KALDI_LOG << "Updating weight projections [original approach, checking each"
- << "Gaussian component].";
-
- SpMatrix<double> v_vT(accs.phn_space_dim_);
- // tot_like_{after, before} are totals over multiple iterations,
- // not valid likelihoods...
- // but difference is valid (when divided by tot_count).
- double tot_delta_predicted = 0.0, tot_delta_observed = 0.0,
- tot_count = 0.0;
-
- Vector<double> w_jm(accs.num_gaussians_);
- Vector<double> g_i(accs.phn_space_dim_);
- SpMatrix<double> F_i(accs.phn_space_dim_);
-
- double k_count = 0.0;
- // Total count in each substate.
- std::vector< Vector<double> > gamma_jm(accs.num_states_);
- for (int32 j = 0; j < accs.num_states_; j++) { // Initialize gamma_jm
- gamma_jm[j].Resize(model->NumSubstates(j));
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- k_count += (gamma_jm[j](m) = accs.gamma_[j].Row(m).Sum());
- }
- }
-
- Matrix<double> w(model->w_);
-
- for (int iter = 0; iter < update_options_.weight_projections_iters; iter++) {
- double k_delta_predicted = 0.0, k_delta_observed = 0.0;
-
- // log total of un-normalized weights for each j, m
- std::vector< Vector<double> > weight_tots(accs.num_states_);
-
- // Initialize weight_tots
- for (int32 j = 0; j < accs.num_states_; j++) {
- weight_tots[j].Resize(model->NumSubstates(j));
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- w_jm.AddMatVec(1.0, w, kNoTrans, Vector<double>(model->v_[j].Row(m)), 0.0);
- weight_tots[j](m) = w_jm.LogSumExp();
- }
- }
-
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- F_i.SetZero();
- g_i.SetZero();
- SubVector<double> w_i = w.Row(i);
-
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- double this_unnormalized_weight = VecVec(w_i, model->v_[j].Row(m));
- double normalizer = weight_tots[j](m);
- double this_log_w = this_unnormalized_weight - normalizer,
- this_w = Exp(this_log_w),
- substate_count = gamma_jm[j](m),
- this_count = accs.gamma_[j](m, i);
-
- double linear_term = this_count - substate_count * this_w;
- double quadratic_term = std::max(this_count, substate_count * this_w);
-
- g_i.AddVec(linear_term, model->v_[j].Row(m));
- // should not ever be zero, but check anyway.
- if (quadratic_term != 0.0)
- F_i.AddVec2(static_cast<BaseFloat>(quadratic_term), model->v_[j].Row(m));
- }
- }
-
- SolverOptions opts;
- opts.name = "w";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- // auxf is formulated in terms of change in w.
- Vector<double> delta_w(accs.phn_space_dim_);
- // returns objf impr with step_size = 1,
- // but it may not be 1 so we recalculate it.
- SolveQuadraticProblem(F_i,
- g_i,
- opts,
- &delta_w);
-
- try { // In case we have a problem in LogSub.
- double step_size, min_step = 0.0001;
- for (step_size = 1.0; step_size >= min_step; step_size /= 2) {
- Vector<double> new_w_i(w_i);
- // copy it in case we do not commit this change.
- std::vector<Vector<double> > new_weight_tots(weight_tots);
- new_w_i.AddVec(step_size, delta_w);
- double predicted_impr = step_size * VecVec(delta_w, g_i) -
- 0.5 * step_size * step_size * VecSpVec(delta_w, F_i, delta_w);
- if (predicted_impr < -0.1) {
- KALDI_WARN << "Negative predicted auxf improvement " <<
- (predicted_impr) << ", not updating this gaussian " <<
- "(either numerical problems or a code mistake.";
- break;
- }
- // Now compute observed objf change.
- double observed_impr = 0.0, this_tot_count = 0.0;
-
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- double old_unnorm_weight = VecVec(w_i, model->v_[j].Row(m)),
- new_unnorm_weight = VecVec(new_w_i, model->v_[j].Row(m)),
- substate_count = gamma_jm[j](m),
- this_count = accs.gamma_[j](m, i);
- this_tot_count += this_count;
- observed_impr += this_count * // from numerator.
- (new_unnorm_weight - old_unnorm_weight);
- double old_normalizer = new_weight_tots[j](m), delta;
- if (new_unnorm_weight > old_unnorm_weight) {
- delta = LogAdd(0, LogSub(new_unnorm_weight - old_normalizer,
- old_unnorm_weight - old_normalizer));
- } else {
- delta = LogSub(0, LogSub(old_unnorm_weight - old_normalizer,
- new_unnorm_weight - old_normalizer));
- // The if-statement above is equivalent to:
- // delta = LogAdd(LogSub(0,
- // old_unnorm_weight-old_normalizer),
- // new_unnorm_weight-old_normalizer)
- // but has better behaviour numerically.
- }
- observed_impr -= substate_count * delta;
- new_weight_tots[j](m) += delta;
- }
- }
- if (observed_impr < 0.0) { // failed, so we reduce step size.
- KALDI_LOG << "Updating weights, for i = " << (i) << ", predicted "
- "auxf: " << (predicted_impr/(this_tot_count + 1.0e-20))
- << ", observed " << observed_impr/(this_tot_count + 1.0e-20)
- << " over " << this_tot_count << " frames. Reducing step size "
- << "to " << (step_size/2);
- if (predicted_impr / (this_tot_count + 1.0e-20) < 1.0e-07) {
- KALDI_WARN << "Not updating this weight vector as auxf decreased"
- << " probably due to numerical issues (since small change).";
- break;
- }
- } else {
- if (i < 10)
- KALDI_LOG << "Updating weights, for i = " << (i)
- << ", auxf change per frame is" << ": predicted " <<
- (predicted_impr /(this_tot_count + 1.0e-20)) << ", observed "
- << (observed_impr / (this_tot_count + 1.0e-20))
- << " over " << (this_tot_count) << " frames.";
-
- k_delta_predicted += predicted_impr;
- k_delta_observed += observed_impr;
- w.Row(i).CopyFromVec(new_w_i);
- weight_tots = new_weight_tots; // Copy over normalizers.
- break;
- }
- }
- } catch(...) {
- KALDI_LOG << "Warning: weight update for i = " << i
- << " failed, possible numerical problem.";
- }
- }
- KALDI_LOG << "For iteration " << iter << ", updating w gives predicted "
- << "per-frame like impr " << (k_delta_predicted / k_count) <<
- ", observed " << (k_delta_observed / k_count) << ", over " << (k_count)
- << " frames";
- if (iter == 0) tot_count += k_count;
- tot_delta_predicted += k_delta_predicted;
- tot_delta_observed += k_delta_observed;
- }
-
- model->w_.CopyFromMat(w);
-
- tot_delta_observed /= tot_count;
- tot_delta_predicted /= tot_count;
- KALDI_LOG << "**Overall objf impr for w is " << tot_delta_predicted
- << ", observed " << tot_delta_observed << ", over "
- << tot_count << " frames";
- return tot_delta_observed;
-}
-
-double MleAmSgmmUpdater::UpdateN(const MleAmSgmmAccs &accs,
- AmSgmm *model) {
- double tot_count = 0.0, tot_like_impr = 0.0;
- if (accs.spk_space_dim_ == 0 || accs.R_.size() == 0 || accs.Z_.size() == 0) {
- KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated";
- }
-
- Vector<double> gamma_i(accs.num_gaussians_);
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- gamma_i.AddVec(1.0, accs.gamma_[j].Row(m));
- }
- }
-
- SolverOptions opts;
- opts.name = "N";
- opts.K = update_options_.max_cond;
- opts.eps = update_options_.epsilon;
-
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- if (gamma_i(i) < 2 * accs.spk_space_dim_) {
- KALDI_WARN << "Not updating speaker basis for i = " << (i)
- << " because count is too small " << (gamma_i(i));
- continue;
- }
- Matrix<double> Ni(model->N_[i]);
- double impr =
- SolveQuadraticMatrixProblem(accs.R_[i], accs.Z_[i],
- SpMatrix<double>(model->SigmaInv_[i]),
- opts, &Ni);
- model->N_[i].CopyFromMat(Ni);
- if (i < 10) {
- KALDI_LOG << "Objf impr for spk projection N for i = " << (i)
- << ", is " << (impr / (gamma_i(i) + 1.0e-20)) << " over "
- << (gamma_i(i)) << " frames";
- }
- tot_count += gamma_i(i);
- tot_like_impr += impr;
- }
-
- tot_like_impr /= (tot_count+1.0e-20);
- KALDI_LOG << "**Overall objf impr for N is " << tot_like_impr << " over "
- << tot_count << " frames";
- return tot_like_impr;
-}
-
-void MleAmSgmmUpdater::RenormalizeN(
- const MleAmSgmmAccs &accs, AmSgmm *model) {
- KALDI_ASSERT(accs.R_.size() != 0);
- Vector<double> gamma_i(accs.num_gaussians_);
- for (int32 j = 0; j < accs.num_states_; j++) {
- for (int32 m = 0; m < model->NumSubstates(j); m++) {
- gamma_i.AddVec(1.0, accs.gamma_[j].Row(m));
- }
- }
- double tot_count = gamma_i.Sum();
- if (tot_count == 0) {
- KALDI_WARN << "Not renormalizing N, since there are no counts.";
- return;
- }
-
- SpMatrix<double> RTot(accs.spk_space_dim_);
- // for (int32 i = 0; i < accs.num_gaussians_; i++) {
- // RTot.AddSp(1.0, accs.R_[i]);
- // }
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- RTot.AddSp(gamma_i(i), accs.R_[i]);
- }
- RTot.Scale(1.0 / tot_count);
- Matrix<double> U(accs.spk_space_dim_, accs.spk_space_dim_);
- Vector<double> eigs(accs.spk_space_dim_);
- RTot.SymPosSemiDefEig(&eigs, &U);
- KALDI_LOG << "Renormalizing N, eigs are: " << (eigs);
- Vector<double> sqrteigs(accs.spk_space_dim_);
- for (int32 t = 0; t < accs.spk_space_dim_; t++) {
- sqrteigs(t) = sqrt(eigs(t));
- }
- // e.g. diag(eigs)^{-0.5} * U' * RTot * U * diag(eigs)^{-0.5} = 1
- // But inverse transpose of this transformation needs to take place on R,
- // i.e. not (on left: diag(eigs)^{-0.5} * U')
- // but: (inverse it: U . diag(eigs)^{0.5},
- // transpose it: diag(eigs)^{0.5} U^T. Need to do this on the right to N
- // (because N has the spk vecs on the right), so N := N U diag(eigs)^{0.5}
- U.MulColsVec(sqrteigs);
- Matrix<double> Ntmp(accs.feature_dim_, accs.spk_space_dim_);
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- Ntmp.AddMatMat(1.0, Matrix<double>(model->N_[i]), kNoTrans, U, kNoTrans, 0.0);
- model->N_[i].CopyFromMat(Ntmp);
- }
-}
-
-
-double MleAmSgmmUpdater::UpdateVars(const MleAmSgmmAccs &accs,
- AmSgmm *model) {
- KALDI_ASSERT(S_means_.size() == static_cast<size_t>(accs.num_gaussians_) &&
- "Must call PreComputeStats before updating the covariances.");
- SpMatrix<double> Sigma_i(accs.feature_dim_), Sigma_i_ml(accs.feature_dim_);
- double tot_objf_impr = 0.0, tot_t = 0.0;
- SpMatrix<double> covfloor(accs.feature_dim_);
- Vector<double> gamma_vec(accs.num_gaussians_);
- Vector<double> objf_improv(accs.num_gaussians_);
-
- // First pass over all (shared) Gaussian components to calculate the
- // ML estimate of the covariances, and the total covariance for flooring.
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- double gamma_i = 0;
- for (int32 j = 0; j < accs.num_states_; j++)
- for (int32 m = 0, end = model->NumSubstates(j); m < end; m++)
- gamma_i += accs.gamma_[j](m, i);
-
- // Eq. (75): Sigma_{i}^{ml} = 1/gamma_{i} [S_{i} + S_{i}^{(means)} - ...
- // Y_{i} M_{i}^T - M_{i} Y_{i}^T]
- // Note the S_means_ already contains the Y_{i} M_{i}^T terms.
- Sigma_i_ml.CopyFromSp(S_means_[i]);
- Sigma_i_ml.AddSp(1.0, accs.S_[i]);
-
- gamma_vec(i) = gamma_i;
- covfloor.AddSp(1.0, Sigma_i_ml);
- // inverting small values e.g. 4.41745328e-40 seems to generate inf,
- // although would be fixed up later.
- if (gamma_i > 1.0e-20) {
- Sigma_i_ml.Scale(1 / (gamma_i + 1.0e-20));
- } else {
- Sigma_i_ml.SetUnit();
- }
- KALDI_ASSERT(1.0 / Sigma_i_ml(0, 0) != 0.0);
- // Eq. (76): Compute the objective function with the old parameter values
- objf_improv(i) = model->SigmaInv_[i].LogPosDefDet() -
- TraceSpSp(SpMatrix<double>(model->SigmaInv_[i]), Sigma_i_ml);
-
- model->SigmaInv_[i].CopyFromSp(Sigma_i_ml); // inverted in the next loop.
- }
-
- // Compute the covariance floor.
- if (gamma_vec.Sum() == 0) { // If no count, use identity.
- KALDI_WARN << "Updating variances: zero counts. Setting floor to unit.";
- covfloor.SetUnit();
- } else { // else, use the global average covariance.
- covfloor.Scale(update_options_.cov_floor / gamma_vec.Sum());
- int32 tmp;
- if ((tmp = covfloor.LimitCondDouble(update_options_.max_cond)) != 0) {
- KALDI_WARN << "Covariance flooring matrix is poorly conditioned. Fixed "
- << "up " << (tmp) << " eigenvalues.";
- }
- }
-
- if (update_options_.cov_diag_ratio > 1000) {
- KALDI_LOG << "Assuming you want to build a diagonal system since "
- << "cov_diag_ratio is large: making diagonal covFloor.";
- for (int32 i = 0; i < covfloor.NumRows(); i++)
- for (int32 j = 0; j < i; j++)
- covfloor(i, j) = 0.0;
- }
-
- // Second pass over all (shared) Gaussian components to calculate the
- // floored estimate of the covariances, and update the model.
- for (int32 i = 0; i < accs.num_gaussians_; i++) {
- Sigma_i.CopyFromSp(model->SigmaInv_[i]);
- Sigma_i_ml.CopyFromSp(Sigma_i);
- // In case of insufficient counts, make the covariance matrix diagonal.
- // cov_diag_ratio is 2 by default, set to very large to always get diag-cov
- if (gamma_vec(i) < update_options_.cov_diag_ratio * accs.feature_dim_) {
- KALDI_WARN << "For Gaussian component " << i << ": Too low count "
- << gamma_vec(i) << " for covariance matrix estimation. Setting to "
- << "diagonal";
- for (int32 d = 0; d < accs.feature_dim_; d++)
- for (int32 e = 0; e < d; e++)
- Sigma_i(d, e) = 0.0; // SpMatrix, can only set lower traingular part
-
- int floored = Sigma_i.ApplyFloor(covfloor);
- if (floored > 0) {
- KALDI_WARN << "For Gaussian component " << i << ": Floored " << floored
- << " covariance eigenvalues.";
- }
- model->SigmaInv_[i].CopyFromSp(Sigma_i);
- model->SigmaInv_[i].InvertDouble();
- } else { // Updating the full covariance matrix.
- try {
- int floored = Sigma_i.ApplyFloor(covfloor);
- if (floored > 0) {
- KALDI_WARN << "For Gaussian component " << i << ": Floored "
- << floored << " covariance eigenvalues.";
- }
- model->SigmaInv_[i].CopyFromSp(Sigma_i);
- model->SigmaInv_[i].InvertDouble();
-
- objf_improv(i) += Sigma_i.LogPosDefDet() +
- TraceSpSp(SpMatrix<double>(model->SigmaInv_[i]), Sigma_i_ml);
- objf_improv(i) *= (-0.5 * gamma_vec(i)); // Eq. (76)
-
- tot_objf_impr += objf_improv(i);
- tot_t += gamma_vec(i);
- if (i < 5) {
- KALDI_VLOG(2) << "objf impr from variance update =" << objf_improv(i)
- / (gamma_vec(i) + 1.0e-20) << " over " << (gamma_vec(i))
- << " frames for i = " << (i);
- }
- } catch(...) {
- KALDI_WARN << "Updating within-class covariance matrix i = " << (i)
- << ", numerical problem";
- // This is a catch-all thing in case of unanticipated errors, but
- // flooring should prevent this occurring for the most part.
- model->SigmaInv_[i].SetUnit(); // Set to unit.
- }
- }
- }
- KALDI_LOG << "**Overall objf impr for variance update = "
- << (tot_objf_impr / (tot_t+ 1.0e-20))
- << " over " << (tot_t) << " frames";
- return tot_objf_impr / (tot_t + 1.0e-20);
-}
-
-
-double MleAmSgmmUpdater::UpdateSubstateWeights(
- const MleAmSgmmAccs &accs, AmSgmm *model) {
- KALDI_LOG << "Updating substate mixture weights";
- // Also set the vector gamma_j which is a cache of the state occupancies.
- gamma_j_.Resize(accs.num_states_);
-
- double tot_gamma = 0.0, objf_impr = 0.0;
- for (int32 j = 0; j < accs.num_states_; j++) {
- double gamma_j_sm = 0.0;
- int32 num_substates = model->NumSubstates(j);
- Vector<double> occs(num_substates),
- smoothed_occs(num_substates);
- for (int32 m = 0; m < num_substates; m++) {
- occs(m) = accs.gamma_[j].Row(m).Sum(); // \sum_i gamma_{jmi}
- gamma_j_(j) += occs(m); // actual state occupancy.
- smoothed_occs(m) = occs(m) + update_options_.tau_c;
- gamma_j_sm += smoothed_occs(m); // smoothed state occupancy for update.
- }
-
- for (int32 m = 0; m < num_substates; m++) {
- double cur_weight = model->c_[j](m);
- if (cur_weight <= 0) {
- KALDI_WARN << "Zero or negative weight, flooring";
- cur_weight = 1.0e-10; // future work(arnab): remove magic numbers
- }
- model->c_[j](m) = smoothed_occs(m) / gamma_j_sm;
- objf_impr += Log(model->c_[j](m) / cur_weight) * occs(m);
- }
- tot_gamma += gamma_j_(j);
- }
- objf_impr /= (tot_gamma + 1.0e-20);
- KALDI_LOG << "**Overall objf impr for c is " << objf_impr << ", over "
- << tot_gamma << " frames.";
- return objf_impr;
-}
-
-
-MleSgmmSpeakerAccs::MleSgmmSpeakerAccs(const AmSgmm &model, BaseFloat prune)
- : rand_prune_(prune) {
- KALDI_ASSERT(model.SpkSpaceDim() != 0);
- H_spk_.resize(model.NumGauss());
- for (int32 i = 0; i < model.NumGauss(); i++) {
- // Eq. (82): H_{i}^{spk} = N_{i}^T \Sigma_{i}^{-1} N_{i}
- H_spk_[i].Resize(model.SpkSpaceDim());
- H_spk_[i].AddMat2Sp(1.0, Matrix<double>(model.N_[i]),
- kTrans, SpMatrix<double>(model.SigmaInv_[i]), 0.0);
- }
-
- model.GetNtransSigmaInv(&NtransSigmaInv_);
-
- gamma_s_.Resize(model.NumGauss());
- y_s_.Resize(model.SpkSpaceDim());
-}
-
-void MleSgmmSpeakerAccs::Clear() {
- y_s_.SetZero();
- gamma_s_.SetZero();
-}
-
-
-BaseFloat
-MleSgmmSpeakerAccs::Accumulate(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- int32 j,
- BaseFloat weight) {
- // Calculate Gaussian posteriors and collect statistics
- Matrix<BaseFloat> posteriors;
- BaseFloat log_like = model.ComponentPosteriors(frame_vars, j, &posteriors);
- posteriors.Scale(weight);
- AccumulateFromPosteriors(model, frame_vars, posteriors, j);
- return log_like;
-}
-
-BaseFloat
-MleSgmmSpeakerAccs::AccumulateFromPosteriors(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- const Matrix<BaseFloat> &posteriors,
- int32 j) {
- double tot_count = 0.0;
- int32 feature_dim = model.FeatureDim(),
- spk_space_dim = model.SpkSpaceDim();
- KALDI_ASSERT(spk_space_dim != 0);
- const vector<int32> &gselect = frame_vars.gselect;
-
- // Intermediate variables
- Vector<double> xt_jmi(feature_dim), mu_jmi(feature_dim),
- zt_jmi(spk_space_dim);
- int32 num_substates = model.NumSubstates(j);
- for (int32 ki = 0; ki < static_cast<int32>(gselect.size()); ki++) {
- int32 i = gselect[ki];
- for (int32 m = 0; m < num_substates; m++) {
- // Eq. (39): gamma_{jmi}(t) = p (j, m, i|t)
- BaseFloat gammat_jmi = RandPrune(posteriors(ki, m), rand_prune_);
- if (gammat_jmi != 0.0) {
- tot_count += gammat_jmi;
- model.GetSubstateMean(j, m, i, &mu_jmi);
- xt_jmi.CopyFromVec(frame_vars.xt);
- xt_jmi.AddVec(-1.0, mu_jmi);
- // Eq. (48): z{jmi}(t) = N_{i}^{T} \Sigma_{i}^{-1} x_{jmi}(t)
- zt_jmi.AddMatVec(1.0, NtransSigmaInv_[i], kNoTrans, xt_jmi, 0.0);
- // Eq. (49): \gamma_{i}^{(s)} = \sum_{t\in\Tau(s), j, m} gamma_{jmi}
- gamma_s_(i) += gammat_jmi;
- // Eq. (50): y^{(s)} = \sum_{t, j, m, i} gamma_{jmi}(t) z_{jmi}(t)
- y_s_.AddVec(gammat_jmi, zt_jmi);
- }
- }
- }
- return tot_count;
-}
-
-void MleSgmmSpeakerAccs::Update(BaseFloat min_count,
- Vector<BaseFloat> *v_s,
- BaseFloat *objf_impr_out,
- BaseFloat *count_out) {
- double tot_gamma = gamma_s_.Sum();
- KALDI_ASSERT(y_s_.Dim() != 0);
- int32 T = y_s_.Dim(); // speaker-subspace dim.
- int32 num_gauss = gamma_s_.Dim();
- if (v_s->Dim() != T) v_s->Resize(T); // will set it to zero.
-
- if (tot_gamma < min_count) {
- KALDI_WARN << "Updating speaker vectors, count is " << tot_gamma
- << " < " << min_count << "not updating.";
- if (objf_impr_out) *objf_impr_out = 0.0;
- if (count_out) *count_out = 0.0;
- return;
- }
-
- // Eq. (84): H^{(s)} = \sum_{i} \gamma_{i}(s) H_{i}^{spk}
- SpMatrix<double> H_s(T);
-
- for (int32 i = 0; i < num_gauss; i++)
- H_s.AddSp(gamma_s_(i), H_spk_[i]);
-
-
- // Don't make these options to SolveQuadraticProblem configurable...
- // they really don't make a difference at all unless the matrix in
- // question is singular, which wouldn't happen in this case.
- Vector<double> v_s_dbl(*v_s);
- double tot_objf_impr =
- SolveQuadraticProblem(H_s, y_s_, SolverOptions("v_s"), &v_s_dbl);
- v_s->CopyFromVec(v_s_dbl);
-
- KALDI_LOG << "*Objf impr for speaker vector is " << (tot_objf_impr / tot_gamma)
- << " over " << (tot_gamma) << " frames.";
-
- if (objf_impr_out) *objf_impr_out = tot_objf_impr;
- if (count_out) *count_out = tot_gamma;
-}
-
-
-MleAmSgmmAccs::~MleAmSgmmAccs() {
- if (gamma_s_.Sum() != 0.0)
- KALDI_ERR << "In destructor of MleAmSgmmAccs: detected that you forgot to "
- "call CommitStatsForSpk()";
-}
-
-
-} // namespace kaldi
diff --git a/src/sgmm/estimate-am-sgmm.h b/src/sgmm/estimate-am-sgmm.h
+++ /dev/null
@@ -1,475 +0,0 @@
-// sgmm/estimate-am-sgmm.h
-
-// Copyright 2009-2011 Microsoft Corporation; Lukas Burget;
-// Saarland University (Author: Arnab Ghoshal);
-// Ondrej Glembek; Yanmin Qian;
-// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey)
-// Liang Lu; Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM_ESTIMATE_AM_SGMM_H_
-#define KALDI_SGMM_ESTIMATE_AM_SGMM_H_ 1
-
-#include <string>
-#include <vector>
-
-#include "sgmm/am-sgmm.h"
-#include "gmm/model-common.h"
-#include "itf/options-itf.h"
-#include "sgmm/sgmm-clusterable.h"
-#include "thread/kaldi-thread.h" // for MultiThreadable
-
-namespace kaldi {
-
-/** \struct MleAmSgmmOptions
- * Configuration variables needed in the SGMM estimation process.
- */
-struct MleAmSgmmOptions {
- /// Configuration Parameters. See initialization code for more comments.
- BaseFloat tau_vec; ///< Amount of smoothing for v_{jm} update
- BaseFloat tau_c; ///< Tau value for smoothing substate weights (c)
- /// Floor covariance matrices Sigma_i to this times average cov.
- BaseFloat cov_floor;
- /// ratio to dim below which we use diagonal. default 2, set to inf for diag.
- BaseFloat cov_diag_ratio;
- /// Max on condition of matrices in update beyond which we do not update.
- /// Should probably be related to numerical properties of machine
- /// or BaseFloat type.
- BaseFloat max_cond;
- /// Limits condition of smoothing matrices H_sm (e.g. 100).
- /// Only really important on 1st iter if using priors.
- BaseFloat max_cond_H_sm;
- /// Fix for the smoothing approach, necessary if max_cond_H_sm != inf
- /// note: only has an effect if tau_vec != 0.
- bool fixup_H_sm;
- /// Set check_v to true if you want to use the "checking" version of the update
- /// for the v's, in which it checks the "real" objective function value and
- /// backtracks if necessary;
- bool check_v;
-
- bool renormalize_V; // Renormalize the phonetic space.
- bool renormalize_N; // Renormalize the speaker space.
-
- /// Number of iters when re-estimating weight projections "w".
- int weight_projections_iters;
- /// The "sequential" weight update that checks each i in turn.
- /// (if false, uses the "parallel" one).
- bool use_sequential_weight_update;
-
- BaseFloat epsilon; ///< very small value used to prevent SVD crashing.
-
- BaseFloat tau_map_M; ///< For MAP update of the phonetic subspace M
- int map_M_prior_iters; ///< num of iterations to update the prior of M
- bool full_row_cov; ///< Estimate row covariance instead of using I
- bool full_col_cov; ///< Estimate col covariance instead of using I
-
- MleAmSgmmOptions() {
- // tau value used in smoothing vector re-estimation (if no prior used).
- tau_vec = 0.0;
- tau_c = 5.0;
- cov_floor = 0.025;
- cov_diag_ratio = 2.0; // set to very large to get diagonal-cov models.
- max_cond = 1.0e+05;
- epsilon = 1.0e-40;
- max_cond_H_sm = 1.0e+05; // only for diagnostics in normal situations.
- fixup_H_sm = true;
- check_v = false; // for back-compat.
- renormalize_V = true;
- renormalize_N = false; // default to false since will invalidate spk vectors
- // on disk.
- weight_projections_iters = 3;
- use_sequential_weight_update = false;
-
- map_M_prior_iters = 5;
- tau_map_M = 0.0; // No MAP update by default (~500-1000 depending on prior)
- full_row_cov = false;
- full_col_cov = false;
- }
-
- void Register(OptionsItf *opts) {
- std::string module = "MleAmSgmmOptions: ";
- opts->Register("tau-vec", &tau_vec, module+
- "Smoothing for phone vector estimation.");
- opts->Register("tau-c", &tau_c, module+
- "Smoothing for substate weights estimation.");
- opts->Register("cov-floor", &cov_floor, module+
- "Covariance floor (fraction of average covariance).");
- opts->Register("cov-diag-ratio", &cov_diag_ratio, module+
- "Minimum occ/dim ratio below which use diagonal covariances.");
- opts->Register("max-cond", &max_cond, module+"Maximum condition number beyond"
- " which matrices are not updated.");
- opts->Register("weight-projections-iters", &weight_projections_iters, module+
- "Number for iterations for weight projection estimation.");
- opts->Register("renormalize-v", &renormalize_V, module+"If true, renormalize "
- "the phonetic-subspace vectors to have meaningful sizes.");
- opts->Register("check-v", &check_v, module+"If true, check real auxf "
- "improvement in update of v and backtrack if needed "
- "(not compatible with smoothing v)");
- opts->Register("renormalize-n", &renormalize_N, module+"If true, renormalize "
- "the speaker subspace to have meaningful sizes.");
-
- opts->Register("tau-map-M", &tau_map_M, module+"Smoothing for MAP estimate "
- "of M (0 means ML update).");
- opts->Register("map-M-prior-iters", &map_M_prior_iters, module+
- "Number of iterations to estimate prior covariances for M.");
- opts->Register("full-row-cov", &full_row_cov, module+
- "Estimate row covariance instead of using I.");
- opts->Register("full-col-cov", &full_col_cov, module+
- "Estimate column covariance instead of using I.");
- }
-};
-
-/** \class MleAmSgmmAccs
- * Class for the accumulators associated with the SGMM parameters except
- * speaker vectors.
- */
-class MleAmSgmmAccs {
- public:
- explicit MleAmSgmmAccs(BaseFloat rand_prune = 1.0e-05)
- : total_frames_(0.0), total_like_(0.0), feature_dim_(0),
- phn_space_dim_(0), spk_space_dim_(0), num_gaussians_(0),
- num_states_(0), rand_prune_(rand_prune) {}
-
- MleAmSgmmAccs(const AmSgmm &model, SgmmUpdateFlagsType flags,
- BaseFloat rand_prune = 1.0e-05)
- : total_frames_(0.0), total_like_(0.0), rand_prune_(rand_prune) {
- ResizeAccumulators(model, flags);
- }
-
- ~MleAmSgmmAccs();
-
- void Read(std::istream &in_stream, bool binary, bool add);
- void Write(std::ostream &out_stream, bool binary) const;
-
- /// Checks the various accumulators for correct sizes given a model. With
- /// wrong sizes, assertion failure occurs. When the show_properties argument
- /// is set to true, dimensions and presence/absence of the various
- /// accumulators are printed. For use when accumulators are read from file.
- void Check(const AmSgmm &model, bool show_properties = true) const;
-
- /// Resizes the accumulators to the correct sizes given the model. The flags
- /// argument control which accumulators to resize.
- void ResizeAccumulators(const AmSgmm &model, SgmmUpdateFlagsType flags);
-
- /// Returns likelihood.
- BaseFloat Accumulate(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- const VectorBase<BaseFloat> &v_s, // spk-vec, may be empty
- int32 state_index, BaseFloat weight,
- SgmmUpdateFlagsType flags);
-
- /// Returns count accumulated (may differ from posteriors.Sum()
- /// due to weight pruning).
- BaseFloat AccumulateFromPosteriors(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- const Matrix<BaseFloat> &posteriors,
- const VectorBase<BaseFloat> &v_s, // may be empty
- int32 state_index,
- SgmmUpdateFlagsType flags);
-
- /// Accumulates global stats for the current speaker (if applicable).
- /// If flags contains kSgmmSpeakerProjections (N), must call
- /// this after finishing the speaker's data.
- void CommitStatsForSpk(const AmSgmm &model,
- const VectorBase<BaseFloat> &v_s);
-
- /// Accessors
- void GetStateOccupancies(Vector<BaseFloat> *occs) const;
- const std::vector< Matrix<double> >& GetOccs() const {
- return gamma_;
- }
- int32 FeatureDim() const { return feature_dim_; }
- int32 PhoneSpaceDim() const { return phn_space_dim_; }
- int32 NumStates() const { return num_states_; }
- int32 NumGauss() const { return num_gaussians_; }
- double TotalFrames() const { return total_frames_; }
- double TotalLike() const { return total_like_; }
-
- private:
- /// The stats which are not tied to any state.
- /// Stats Y_{i} for phonetic-subspace projections M; Dim is [I][D][S].
- std::vector< Matrix<double> > Y_;
- /// Stats Z_{i} for speaker-subspace projections N. Dim is [I][D][T].
- std::vector< Matrix<double> > Z_;
- /// R_{i}, quadratic term for speaker subspace estimation. Dim is [I][T][T]
- std::vector< SpMatrix<double> > R_;
- /// S_{i}^{-}, scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D].
- std::vector< SpMatrix<double> > S_;
-
- /// The SGMM state specific stats.
- /// Statistics y_{jm} for state vectors v_{jm}. dimension is [J][M_{j}[S].
- std::vector< Matrix<double> > y_;
- /// Gaussian occupancies gamma_{jmi} for each substate. Dim is [J][M_{j}][I].
- std::vector< Matrix<double> > gamma_;
-
- /// gamma_{i}^{(s)}. Per-speaker counts for each Gaussian. Dimension is [I]
- /// Needed for stats R_.
- Vector<double> gamma_s_;
-
- double total_frames_, total_like_;
-
- /// Dimensionality of various subspaces
- int32 feature_dim_, phn_space_dim_, spk_space_dim_;
- int32 num_gaussians_, num_states_; ///< Other model specifications
-
- BaseFloat rand_prune_;
-
- KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmAccs);
- friend class MleAmSgmmUpdater;
- friend class EbwAmSgmmUpdater;
- friend class MleAmSgmmGlobalAccs;
-};
-
-/** \class MleAmSgmmUpdater
- * Contains the functions needed to update the SGMM parameters.
- */
-class MleAmSgmmUpdater {
- public:
- explicit MleAmSgmmUpdater(const MleAmSgmmOptions &options)
- : update_options_(options) {}
- void Reconfigure(const MleAmSgmmOptions &options) {
- update_options_ = options;
- }
-
- /// Main update function: Computes some overall stats, does parameter updates
- /// and returns the total improvement of the different auxiliary functions.
- BaseFloat Update(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- SgmmUpdateFlagsType flags);
-
- /// This function is like UpdatePhoneVectorsChecked, which supports
- /// objective-function checking and backtracking but no smoothing term, but it
- /// takes as input the stats used in SGMM-based tree clustering-- this is used
- /// in initializing an SGMM from the tree stats. It's not part of the
- /// normal recipe.
- double UpdatePhoneVectorsCheckedFromClusterable(
- const std::vector<SgmmClusterable*> &stats,
- const std::vector<SpMatrix<double> > &H,
- AmSgmm *model);
-
- protected:
- friend class UpdateWParallelClass;
- friend class UpdatePhoneVectorsClass;
- friend class UpdatePhoneVectorsCheckedFromClusterableClass;
- friend class EbwEstimateAmSgmm;
-
- /// Compute the Q_i quantities (Eq. 64).
- static void ComputeQ(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- std::vector< SpMatrix<double> > *Q);
-
- /// Compute the S_means quantities, minus sum: (Y_i M_i^T + M_i Y_I^T).
- static void ComputeSMeans(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- std::vector< SpMatrix<double> > *S_means);
- friend class EbwAmSgmmUpdater;
- private:
- MleAmSgmmOptions update_options_;
- /// Q_{i}, quadratic term for phonetic subspace estimation. Dim is [I][S][S]
- std::vector< SpMatrix<double> > Q_;
-
- /// Eq (74): S_{i}^{(means)}, scatter of substate mean vectors for estimating
- /// the shared covariance matrices. [Actually this variable contains also the
- /// term -(Y_i M_i^T + M_i Y_I^T).] Dimension is [I][D][D].
- std::vector< SpMatrix<double> > S_means_;
-
- Vector<double> gamma_j_; ///< State occupancies
-
-
- void ComputeSmoothingTerms(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const std::vector< SpMatrix<double> > &H,
- SpMatrix<double> *H_sm,
- Vector<double> *y_sm) const;
-
- // UpdatePhoneVectors function that allows smoothing terms (but
- // no checking of proper auxiliary function RE weights)
- double UpdatePhoneVectors(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- const SpMatrix<double> &H_sm,
- const Vector<double> &y_sm);
-
-
- // Called from UpdatePhoneVectors; updates a subset of states
- // (relates to multi-threading).
- void UpdatePhoneVectorsInternal(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H,
- const SpMatrix<double> &H_sm,
- const Vector<double> &y_sm,
- double *auxf_impr,
- double *like_impr,
- int32 num_threads,
- int32 thread_id) const;
-
- // UpdatePhoneVectors function that does not support smoothing
- // terms, but allows checking of objective-function improvement,
- // and backtracking.
- double UpdatePhoneVectorsChecked(const MleAmSgmmAccs &accs,
- AmSgmm *model,
- const std::vector<SpMatrix<double> > &H);
-
- // Called (indirectly) from UpdatePhoneVectorsCheckedFromClusterable()
- void UpdatePhoneVectorsCheckedFromClusterableInternal(
- const std::vector<SgmmClusterable*> &stats,
- const std::vector< SpMatrix<double> > &H,
- AmSgmm *model,
- double *count_ptr,
- double *like_impr_ptr,
- int32 num_threads,
- int32 thread_id);
-
- double UpdateM(const MleAmSgmmAccs &accs, AmSgmm *model);
-
- void RenormalizeV(const MleAmSgmmAccs &accs, AmSgmm *model,
- const SpMatrix<double> &H_sm);
- double UpdateN(const MleAmSgmmAccs &accs, AmSgmm *model);
- void RenormalizeN(const MleAmSgmmAccs &accs, AmSgmm *model);
- double UpdateVars(const MleAmSgmmAccs &accs, AmSgmm *model);
- double UpdateWParallel(const MleAmSgmmAccs &accs, AmSgmm *model);
-
- /// Called, multithreaded, inside UpdateWParallel
- static
- void UpdateWParallelGetStats(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const Matrix<double> &w,
- Matrix<double> *F_i,
- Matrix<double> *g_i,
- double *tot_like,
- int32 num_threads,
- int32 thread_id);
-
- double UpdateWSequential(const MleAmSgmmAccs &accs,
- AmSgmm *model);
- double UpdateSubstateWeights(const MleAmSgmmAccs &accs,
- AmSgmm *model);
-
- void ComputeMPrior(AmSgmm *model); // TODO(arnab): Maybe make this static?
- double MapUpdateM(const MleAmSgmmAccs &accs, AmSgmm *model);
-
- KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmUpdater);
- MleAmSgmmUpdater() {} // Prevent unconfigured updater.
-};
-
-
-/** \class MleSgmmSpeakerAccs
- * Class for the accumulators required to update the speaker
- * vectors v_s.
- * Note: if you have multiple speakers you will want to initialize
- * this just once and call Clear() after you're done with each speaker,
- * rather than creating a new object for each speaker, since the
- * initialization function does nontrivial work.
- */
-
-class MleSgmmSpeakerAccs {
- public:
- /// Initialize the object. Error if speaker subspace not set up.
- MleSgmmSpeakerAccs(const AmSgmm &model, BaseFloat rand_prune_ = 1.0e-05);
-
- /// Clear the statistics.
- void Clear();
-
- /// Accumulate statistics. Returns per-frame log-likelihood.
- BaseFloat Accumulate(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- int32 state_index, BaseFloat weight);
-
- /// Accumulate statistics, given posteriors. Returns total
- /// count accumulated, which may differ from posteriors.Sum()
- /// due to randomized pruning.
- BaseFloat AccumulateFromPosteriors(const AmSgmm &model,
- const SgmmPerFrameDerivedVars &frame_vars,
- const Matrix<BaseFloat> &posteriors,
- int32 state_index);
-
- /// Update speaker vector. If v_s was empty, will assume it started as zero
- /// and will resize it to the speaker-subspace size.
- void Update(BaseFloat min_count, // e.g. 100
- Vector<BaseFloat> *v_s,
- BaseFloat *objf_impr_out,
- BaseFloat *count_out);
-
- private:
- /// Statistics for speaker adaptation (vectors), stored per-speaker.
- /// Per-speaker stats for vectors, y^{(s)}. Dimension [T].
- Vector<double> y_s_;
- /// gamma_{i}^{(s)}. Per-speaker counts for each Gaussian. Dimension is [I]
- Vector<double> gamma_s_;
-
- /// The following variable does not change per speaker.
- /// Eq. (82): H_{i}^{spk} = N_{i}^T \Sigma_{i}^{-1} N_{i}
- std::vector< SpMatrix<double> > H_spk_;
-
- /// N_i^T \Sigma_{i}^{-1}. Needed for y^{(s)}
- std::vector< Matrix<double> > NtransSigmaInv_;
-
- /// small constant to randomly prune tiny posteriors
- BaseFloat rand_prune_;
-};
-
-// This class, used in multi-core implementation of the updates of the "w_i"
-// quantities, was previously in estimate-am-sgmm.cc, but is being moved to the
-// header so it can be used in estimate-am-sgmm-ebw.cc. It is responsible for
-// computing, in parallel, the F_i and g_i quantities used in the updates of
-// w_i.
-class UpdateWParallelClass: public MultiThreadable {
- public:
- UpdateWParallelClass(const MleAmSgmmAccs &accs,
- const AmSgmm &model,
- const Matrix<double> &w,
- Matrix<double> *F_i,
- Matrix<double> *g_i,
- double *tot_like):
- accs_(accs), model_(model), w_(w),
- F_i_ptr_(F_i), g_i_ptr_(g_i), tot_like_ptr_(tot_like) {
- tot_like_ = 0.0;
- F_i_.Resize(F_i->NumRows(), F_i->NumCols());
- g_i_.Resize(g_i->NumRows(), g_i->NumCols());
- }
-
- ~UpdateWParallelClass() {
- F_i_ptr_->AddMat(1.0, F_i_, kNoTrans);
- g_i_ptr_->AddMat(1.0, g_i_, kNoTrans);
- *tot_like_ptr_ += tot_like_;
- }
-
- inline void operator() () {
- // Note: give them local copy of the sums we're computing,
- // which will be propagated to the total sums in the destructor.
- MleAmSgmmUpdater::UpdateWParallelGetStats(accs_, model_, w_,
- &F_i_, &g_i_, &tot_like_,
- num_threads_, thread_id_);
- }
- private:
- // MleAmSgmmUpdater *updater_;
- const MleAmSgmmAccs &accs_;
- const AmSgmm &model_;
- const Matrix<double> &w_;
- Matrix<double> *F_i_ptr_;
- Matrix<double> *g_i_ptr_;
- Matrix<double> F_i_;
- Matrix<double> g_i_;
- double *tot_like_ptr_;
- double tot_like_;
-};
-
-
-} // namespace kaldi
-
-
-#endif // KALDI_SGMM_ESTIMATE_AM_SGMM_H_
diff --git a/src/sgmm/fmllr-sgmm-test.cc b/src/sgmm/fmllr-sgmm-test.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-// sgmm/fmllr-sgmm-test.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "base/kaldi-math.h"
-#include "gmm/model-test-common.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/fmllr-sgmm.h"
-#include "util/kaldi-io.h"
-
-using kaldi::AmSgmm;
-using kaldi::int32;
-using kaldi::BaseFloat;
-using kaldi::Vector;
-using kaldi::Matrix;
-using kaldi::Exp;
-
-namespace ut = kaldi::unittest;
-
-void ApplyFmllrXform(const kaldi::VectorBase<BaseFloat> &in,
- const Matrix<BaseFloat> &xf,
- Vector<BaseFloat> *out) {
- int32 dim = in.Dim();
- KALDI_ASSERT(xf.NumRows() == dim && xf.NumCols() == dim + 1);
- Vector<BaseFloat> tmp(dim + 1);
- tmp.Range(0, dim).CopyFromVec(in);
- tmp(dim) = 1.0;
- out->Resize(dim, kaldi::kSetZero);
- out->AddMatVec(1.0, xf, kaldi::kNoTrans, tmp, 0.0);
-}
-
-// Tests the Read() and Write() methods for the accumulators, in both binary
-// and ASCII mode, as well as Check().
-void TestSgmmFmllrAccsIO(const AmSgmm &sgmm,
- const kaldi::Matrix<BaseFloat> &feats) {
- KALDI_LOG << "Test IO start.";
- using namespace kaldi;
- int32 dim = sgmm.FeatureDim();
- kaldi::SgmmPerFrameDerivedVars frame_vars;
- kaldi::SgmmPerSpkDerivedVars empty;
- kaldi::SgmmFmllrGlobalParams fmllr_globals;
- kaldi::SgmmGselectConfig sgmm_config;
-
- frame_vars.Resize(sgmm.NumGauss(), dim, sgmm.PhoneSpaceDim());
- sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest,
- sgmm.NumGauss());
- kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
- occs.Set(feats.NumRows());
- sgmm.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_,
- &fmllr_globals.inv_xform_,
- &fmllr_globals.mean_scatter_);
- if (fmllr_globals.mean_scatter_.Min() == 0.0) {
- KALDI_WARN << "Global covariances low rank!";
- KALDI_WARN << "Diag-scatter = " << fmllr_globals.mean_scatter_;
- return;
- }
-
-// std::cout << "Pre-Xform = " << fmllr_globals.pre_xform_;
-// std::cout << "Inv-Xform = " << fmllr_globals.inv_xform_;
-
- FmllrSgmmAccs accs;
- accs.Init(sgmm.FeatureDim(), sgmm.NumGauss());
- BaseFloat loglike = 0.0;
- Vector<BaseFloat> empty_spk;
- std::vector<int32> gselect;
- for (int32 i = 0; i < feats.NumRows(); i++) {
- sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect);
- sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, 0.0, &frame_vars);
- loglike += accs.Accumulate(sgmm, empty, feats.Row(i), frame_vars, 0, 1.0);
- }
-
- kaldi::SgmmFmllrConfig update_opts;
-// update_opts.fmllr_min_count = 100;
- kaldi::Matrix<BaseFloat> xform_mat(dim, dim+1);
- xform_mat.SetUnit();
- BaseFloat frames, impr;
- accs.Update(sgmm, fmllr_globals, update_opts, &xform_mat, &frames, &impr);
-
- Vector<BaseFloat> xformed_feat(dim);
- ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat);
- sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect);
- sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, 0.0, &frame_vars);
- BaseFloat loglike1 = sgmm.LogLikelihood(frame_vars, 0);
-
- bool binary_in;
- // First, non-binary write
- KALDI_LOG << "Test ASCII IO.";
- accs.Write(kaldi::Output("tmpf", false).Stream(), false);
- FmllrSgmmAccs *accs1 = new FmllrSgmmAccs();
- // Non-binary read
- kaldi::Input ki1("tmpf", &binary_in);
- accs1->Read(ki1.Stream(), binary_in, false);
- xform_mat.SetUnit();
- accs1->Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL);
- ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat);
- sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect);
- sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, 0.0, &frame_vars);
- BaseFloat loglike2 = sgmm.LogLikelihood(frame_vars, 0);
- std::cout << "LL1 = " << loglike1 << ", LL2 = " << loglike2 << std::endl;
- kaldi::AssertEqual(loglike1, loglike2, 1e-2);
- delete accs1;
-
- // Next, binary write
- KALDI_LOG << "Test Binary IO.";
- accs.Write(kaldi::Output("tmpfb", true).Stream(), true);
- FmllrSgmmAccs *accs2 = new FmllrSgmmAccs();
- // Binary read
- kaldi::Input ki2("tmpfb", &binary_in);
- accs2->Read(ki2.Stream(), binary_in, false);
- xform_mat.SetUnit();
- accs2->Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL);
- ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat);
- sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect);
- sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, 0.0, &frame_vars);
- BaseFloat loglike3 = sgmm.LogLikelihood(frame_vars, 0);
- std::cout << "LL1 = " << loglike1 << ", LL3 = " << loglike3 << std::endl;
- kaldi::AssertEqual(loglike1, loglike3, 1e-4);
- delete accs2;
- KALDI_LOG << "Test IO end.";
-
- unlink("tmpf");
- unlink("tmpfb");
-}
-
-void TestSgmmFmllrSubspace(const AmSgmm &sgmm,
- const kaldi::Matrix<BaseFloat> &feats) {
- KALDI_LOG << "Test Subspace start.";
- using namespace kaldi;
- int32 dim = sgmm.FeatureDim();
- kaldi::SgmmPerFrameDerivedVars frame_vars;
- kaldi::SgmmPerSpkDerivedVars empty;
- kaldi::SgmmFmllrGlobalParams fmllr_globals;
- kaldi::SgmmGselectConfig sgmm_config;
-
- frame_vars.Resize(sgmm.NumGauss(), dim, sgmm.PhoneSpaceDim());
- sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest,
- sgmm.NumGauss());
- kaldi::Vector<BaseFloat> occs(sgmm.NumPdfs());
- occs.Set(feats.NumRows());
- sgmm.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_,
- &fmllr_globals.inv_xform_,
- &fmllr_globals.mean_scatter_);
- if (fmllr_globals.mean_scatter_.Min() == 0.0) {
- KALDI_WARN << "Global covariances low rank!";
- KALDI_WARN << "Diag-scatter = " << fmllr_globals.mean_scatter_;
- return;
- }
-
- FmllrSgmmAccs accs;
- accs.Init(sgmm.FeatureDim(), sgmm.NumGauss());
- BaseFloat loglike = 0.0;
- Vector<BaseFloat> empty_spk;
- std::vector<int32> gselect;
- for (int32 i = 0; i < feats.NumRows(); i++) {
- sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect);
- sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, 0.0, &frame_vars);
- loglike += accs.Accumulate(sgmm, empty, feats.Row(i), frame_vars, 0, 1.0);
- }
-
- SpMatrix<double> grad_scatter(dim * (dim+1));
- accs.AccumulateForFmllrSubspace(sgmm, fmllr_globals, &grad_scatter);
- kaldi::SgmmFmllrConfig update_opts;
- EstimateSgmmFmllrSubspace(grad_scatter, update_opts.num_fmllr_bases, dim,
- &fmllr_globals);
-// update_opts.fmllr_min_count = 100;
- kaldi::Matrix<BaseFloat> xform_mat(dim, dim+1);
- xform_mat.SetUnit();
- accs.Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL);
- KALDI_LOG << "Test Subspace end.";
-}
-
-void TestSgmmFmllr() {
- // srand(time(NULL));
- int32 dim = 1 + kaldi::RandInt(0, 9); // random dimension of the gmm
- int32 num_comp = 2 + kaldi::RandInt(0, 9); // random number of mixtures
- kaldi::FullGmm full_gmm;
- ut::InitRandFullGmm(dim, num_comp, &full_gmm);
-
- int32 num_states = 1;
- AmSgmm sgmm;
- kaldi::SgmmGselectConfig config;
- sgmm.InitializeFromFullGmm(full_gmm, num_states, dim+1, dim);
- sgmm.ComputeNormalizers();
-
- kaldi::Matrix<BaseFloat> feats;
-
- { // First, generate random means and variances
- int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2);
- kaldi::Matrix<BaseFloat> means(num_feat_comp, dim),
- vars(num_feat_comp, dim);
- for (int32 m = 0; m < num_feat_comp; m++) {
- for (int32 d= 0; d < dim; d++) {
- means(m, d) = kaldi::RandGauss();
- vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2;
- }
- }
- // Now generate random features with those means and variances.
- feats.Resize(num_feat_comp * 200, dim);
- for (int32 m = 0; m < num_feat_comp; m++) {
- kaldi::SubMatrix<BaseFloat> tmp(feats, m*200, 200, 0, dim);
- ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp);
- }
- }
- TestSgmmFmllrAccsIO(sgmm, feats);
- TestSgmmFmllrSubspace(sgmm, feats);
-}
-
-int main() {
- std::srand(1000);
- kaldi::g_kaldi_verbose_level = 5;
- for (int i = 0; i < 10; i++)
- TestSgmmFmllr();
- std::cout << "Test OK.\n";
- return 0;
-}
diff --git a/src/sgmm/fmllr-sgmm.cc b/src/sgmm/fmllr-sgmm.cc
--- a/src/sgmm/fmllr-sgmm.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-// sgmm/fmllr-sgmm.cc
-
-// Copyright 2009-2011 Saarland University (author: Arnab Ghoshal)
-// 2012 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <string>
-#include <vector>
-using std::vector;
-
-#include "sgmm/fmllr-sgmm.h"
-#include "util/parse-options.h"
-
-namespace kaldi {
-
-static void ApplyPreXformToGradient(const SgmmFmllrGlobalParams &globals,
- const Matrix<BaseFloat> &gradient_in,
- Matrix<BaseFloat> *gradient_out) {
- // Eq. (B.14): P' = A_{inv}^T P {W_{pre}^+}^T
- int32 dim = gradient_in.NumRows();
- Matrix<BaseFloat> Wpre_plus(dim + 1, dim + 1, kSetZero);
- Wpre_plus.Range(0, dim, 0, dim + 1).CopyFromMat(globals.pre_xform_);
- Wpre_plus(dim, dim) = 1;
- SubMatrix<BaseFloat> Ainv(globals.inv_xform_, 0, dim, 0, dim);
- Matrix<BaseFloat> AinvP(dim, dim + 1, kUndefined);
- AinvP.AddMatMat(1.0, Ainv, kTrans, gradient_in, kNoTrans, 0.0);
- gradient_out->AddMatMat(1.0, AinvP, kNoTrans, Wpre_plus, kTrans, 0.0);
-}
-
-static void ApplyInvPreXformToChange(const SgmmFmllrGlobalParams &globals,
- const Matrix<BaseFloat> &delta_in,
- Matrix<BaseFloat> *delta_out) {
- // Eq. (B.25): \Delta = A_{inv} \Delta' W_{pre}^+
- int32 dim = delta_in.NumRows();
- Matrix<BaseFloat> Wpre_plus(dim + 1, dim + 1, kSetZero);
- Wpre_plus.Range(0, dim, 0, dim + 1).CopyFromMat(globals.pre_xform_);
- Wpre_plus(dim, dim) = 1;
- SubMatrix<BaseFloat> Ainv(globals.inv_xform_, 0, dim, 0, dim);
- Matrix<BaseFloat> AinvD(dim, dim + 1, kUndefined);
- AinvD.AddMatMat(1.0, Ainv, kNoTrans, delta_in, kNoTrans, 0.0);
- delta_out->AddMatMat(1.0, AinvD, kNoTrans, Wpre_plus, kNoTrans, 0.0);
-}
-
-static void ApplyHessianXformToGradient(const SgmmFmllrGlobalParams &globals,
- const Matrix<BaseFloat> &gradient_in,
- Matrix<BaseFloat> *gradient_out) {
- int32 dim = gradient_in.NumRows();
- const Vector<BaseFloat> &D = globals.mean_scatter_;
- if (D.Min() <= 0.0)
- KALDI_ERR << "Cannot estimate FMLLR: mean scatter has 0 eigenvalues.";
- for (int32 r = 0; r < dim; r++) {
- for (int32 c = 0; c < r; c++) {
- // Eq. (B.15)
- (*gradient_out)(r, c) = gradient_in(r, c) / std::sqrt(1 + D(c));
- // Eq. (B.16)
- (*gradient_out)(c, r) = gradient_in(c, r) / std::sqrt(1 + D(r) -
- 1 / (1 + D(c))) - gradient_in(r, c) / ((1 + D(c)) *
- std::sqrt(1 + D(r) - 1 / (1 + D(c))));
- }
- // Eq. (B.17) & (B.18)
- (*gradient_out)(r, r) = gradient_in(r, r) / std::sqrt(2 + D(r));
- (*gradient_out)(r, dim) = gradient_in(r, dim);
- }
-}
-
-static void ApplyInvHessianXformToChange(const SgmmFmllrGlobalParams &globals,
- const Matrix<BaseFloat> &delta_in,
- Matrix<BaseFloat> *delta_out) {
- int32 dim = delta_in.NumRows();
- const Vector<BaseFloat> &D = globals.mean_scatter_;
- if (D.Min() <= 0.0)
- KALDI_ERR << "Cannot estimate FMLLR: mean scatter has 0 eigenvalues.";
- for (int32 r = 0; r < dim; r++) {
- for (int32 c = 0; c < r; c++) {
- // Eq. (B.21)
- (*delta_out)(r, c) = delta_in(r, c) / std::sqrt(1 + D(c)) -
- delta_in(c, r) / ((1 + D(c)) * std::sqrt(1 + D(r) - 1 / (1 + D(c))));
- // Eq. (B.22)
- (*delta_out)(c, r) = delta_in(c, r) / std::sqrt(1 + D(r) - 1/ (1 + D(c)));
- }
- // Eq. (B.23) & (B.24)
- (*delta_out)(r, r) = delta_in(r, r) / std::sqrt(2 + D(r));
- (*delta_out)(r, dim) = delta_in(r, dim);
- }
-}
-
-
-void SgmmFmllrGlobalParams::Write(std::ostream &out, bool binary) const {
- WriteToken(out, binary, "<SGMM_FMLLR_GLOBAL_PARAMS>");
- WriteToken(out, binary, "<PRE_XFORM>");
- pre_xform_.Write(out, binary);
- WriteToken(out, binary, "<INV_XFORM>");
- inv_xform_.Write(out, binary);
- WriteToken(out, binary, "<MEAN_SCATTER>");
- mean_scatter_.Write(out, binary);
- if (fmllr_bases_.size() != 0) {
- WriteToken(out, binary, "<FMLLR_BASIS>");
- uint32 tmp = static_cast<uint32>(fmllr_bases_.size());
- WriteBasicType(out, binary, tmp);
- for (uint32 i = 0; i < tmp; i++) {
- fmllr_bases_[i].Write(out, binary);
- }
- }
- WriteToken(out, binary, "</SGMM_FMLLR_GLOBAL_PARAMS>");
-}
-
-void SgmmFmllrGlobalParams::Read(std::istream &in, bool binary) {
- ExpectToken(in, binary, "<SGMM_FMLLR_GLOBAL_PARAMS>");
- ExpectToken(in, binary, "<PRE_XFORM>");
- pre_xform_.Read(in, binary);
- ExpectToken(in, binary, "<INV_XFORM>");
- inv_xform_.Read(in, binary);
- ExpectToken(in, binary, "<MEAN_SCATTER>");
- mean_scatter_.Read(in, binary);
- std::string token;
- ReadToken(in, binary, &token);
- if (token == "<FMLLR_BASIS>") {
- uint32 tmp;
- ReadBasicType(in, binary, &tmp);
- fmllr_bases_.resize(tmp);
- for (uint32 i = 0; i < tmp; i++) {
- fmllr_bases_[i].Read(in, binary);
- }
- } else {
- if (token != "</SGMM_FMLLR_GLOBAL_PARAMS>")
- KALDI_ERR << "Unexpected token '" << token << "' found.";
- }
-}
-
-
-void FmllrSgmmAccs::Init(int32 dim, int32 num_gaussians) {
- if (dim == 0) { // empty stats
- dim_ = 0; // non-zero dimension is meaningless in empty stats
- stats_.Init(0, 0); // clear the stats
- } else {
- dim_ = dim;
- stats_.Init(dim, num_gaussians);
- }
-}
-
-BaseFloat FmllrSgmmAccs::Accumulate(const AmSgmm &model,
- const SgmmPerSpkDerivedVars &spk,
- const VectorBase<BaseFloat> &data,
- const SgmmPerFrameDerivedVars &frame_vars,
- int32 pdf_index, BaseFloat weight) {
- // Calulate Gaussian posteriors and collect statistics
- Matrix<BaseFloat> posteriors;
- BaseFloat log_like = model.ComponentPosteriors(frame_vars, pdf_index,
- &posteriors);
- posteriors.Scale(weight);
- AccumulateFromPosteriors(model, spk, data, frame_vars.gselect, posteriors,
- pdf_index);
- return log_like;
-}
-
-void
-FmllrSgmmAccs::AccumulateFromPosteriors(const AmSgmm &model,
- const SgmmPerSpkDerivedVars &spk,
- const VectorBase<BaseFloat> &data,
- const vector<int32> &gselect,
- const Matrix<BaseFloat> &posteriors,
- int32 pdf_index) {
- Vector<double> var_scaled_mean(dim_), extended_data(dim_+1);
- extended_data.Range(0, dim_).CopyFromVec(data);
- extended_data(dim_) = 1.0;
- SpMatrix<double> scatter(dim_+1, kSetZero);
- scatter.AddVec2(1.0, extended_data);
-
- for (int32 ki = 0, ki_max = gselect.size(); ki < ki_max; ki++) {
- int32 i = gselect[ki];
-
- for (int32 m = 0; m < model.NumSubstates(pdf_index); m++) {
- // posterior gamma_{jkmi}(t) eq.(39)
- BaseFloat gammat_jmi = posteriors(ki, m);
-
- // Accumulate statistics for non-zero gaussian posterior
- if (gammat_jmi > 0.0) {
- stats_.beta_ += gammat_jmi;
- model.GetVarScaledSubstateSpeakerMean(pdf_index, m, i, spk,
- &var_scaled_mean);
- // Eq. (52): K += \gamma_{jmi} \Sigma_{i}^{-1} \mu_{jmi}^{(s)} x^{+T}
- stats_.K_.AddVecVec(gammat_jmi, var_scaled_mean, extended_data);
- // Eq. (53): G_{i} += \gamma_{jmi} x^{+} x^{+T}
- stats_.G_[i].AddSp(gammat_jmi, scatter);
- } // non-zero posteriors
- } // loop over substates
- } // loop over selected Gaussians
-}
-
-void FmllrSgmmAccs::AccumulateForFmllrSubspace(const AmSgmm &sgmm,
- const SgmmFmllrGlobalParams &globals, SpMatrix<double> *grad_scatter) {
- if (stats_.beta_ <= 0.0) {
- KALDI_WARN << "Not committing any stats since no stats accumulated.";
- return;
- }
- int32 dim = sgmm.FeatureDim();
- Matrix<BaseFloat> xform(dim, dim + 1, kUndefined);
- xform.SetUnit();
- Matrix<BaseFloat> grad(dim, dim + 1, kSetZero);
- this->FmllrObjGradient(sgmm, xform, &grad, NULL);
- Matrix<BaseFloat> pre_xformed_grad(dim, dim + 1, kSetZero);
- ApplyPreXformToGradient(globals, grad, &pre_xformed_grad);
- Matrix<BaseFloat> hess_xformed_grad(dim, dim + 1, kSetZero);
- ApplyHessianXformToGradient(globals, pre_xformed_grad, &hess_xformed_grad);
- Vector<double> grad_vec(dim * (dim + 1));
- grad_vec.CopyRowsFromMat(hess_xformed_grad);
- grad_vec.Scale(1 / std::sqrt(stats_.beta_));
- grad_scatter->AddVec2(1.0, grad_vec);
- KALDI_LOG << "Frame counts for when committing fMLLR subspace stats are "
- << stats_.beta_;
-}
-
-
-BaseFloat FmllrSgmmAccs::FmllrObjGradient(const AmSgmm &sgmm,
- const Matrix<BaseFloat> &xform,
- Matrix<BaseFloat> *grad_out,
- Matrix<BaseFloat> *G_out) const {
- int32 dim = sgmm.FeatureDim(),
- num_gauss = sgmm.NumGauss();
- KALDI_ASSERT(stats_.G_.size() == static_cast<size_t>(num_gauss));
- Matrix<double> xform_d(xform);
- SubMatrix<double> A(xform_d, 0, dim, 0, dim);
- Matrix<double> xform_g(dim, dim + 1), total_g(dim, dim + 1);
- SpMatrix<double> inv_covar(dim);
- double obj = stats_.beta_ * A.LogDet() +
- TraceMatMat(xform_d, stats_.K_, kTrans);
- for (int32 i = 0; i < num_gauss; i++) {
- sgmm.GetInvCovars(i, &inv_covar);
- xform_g.AddMatSp(1.0, xform_d, kNoTrans, stats_.G_[i], 0.0);
- total_g.AddSpMat(1.0, inv_covar, xform_g, kNoTrans, 1.0);
- }
- obj -= 0.5 * TraceMatMat(xform_d, total_g, kTrans);
- if (G_out != NULL) G_out->CopyFromMat(total_g);
-
- // Compute the gradient: P = \beta [(A^{-1})^{T} , 0] + K - S
- if (grad_out != NULL) {
- Matrix<double> grad_d(dim, dim + 1, kSetZero);
- grad_d.Range(0, dim, 0, dim).CopyFromMat(A);
- grad_d.Range(0, dim, 0, dim).InvertDouble();
- grad_d.Range(0, dim, 0, dim).Transpose();
- grad_d.Scale(stats_.beta_);
- grad_d.AddMat(-1.0, total_g, kNoTrans);
- grad_d.AddMat(1.0, stats_.K_, kNoTrans);
- grad_out->CopyFromMat(grad_d);
- }
-
- return obj;
-}
-
-
-void FmllrSgmmAccs::Write(std::ostream &out, bool binary) const {
- WriteToken(out, binary, "<FMLLRACCS>");
- WriteToken(out, binary, "<DIMENSION>");
- WriteBasicType(out, binary, dim_);
- WriteToken(out, binary, "<STATS>");
- stats_.Write(out, binary);
- WriteToken(out, binary, "</FMLLRACCS>");
-}
-
-void FmllrSgmmAccs::Read(std::istream &in, bool binary, bool add) {
- ExpectToken(in, binary, "<FMLLRACCS>");
- ExpectToken(in, binary, "<DIMENSION>");
- ReadBasicType(in, binary, &dim_);
- KALDI_ASSERT(dim_ > 0);
- ExpectToken(in, binary, "<STATS>");
- stats_.Read(in, binary, add);
- ExpectToken(in, binary, "</FMLLRACCS>");
-}
-
-
-static BaseFloat CalcFmllrStepSize(const AffineXformStats &stats,
- const AmSgmm &sgmm,
- const MatrixBase<BaseFloat> &Delta,
- const MatrixBase<BaseFloat> &A,
- const Matrix<BaseFloat> &G,
- int32 max_iters) {
- int32 dim = sgmm.FeatureDim();
- Matrix<double> Delta_d(Delta);
- Matrix<double> G_d(G);
- SubMatrix<double> Delta_C(Delta_d, 0, dim, 0, dim);
-
- // Eq. (B.28): m = tr(\Delta K^T) - tr(\Delta S^T)
- BaseFloat m = TraceMatMat(Delta_d, stats.K_, kTrans)
- - TraceMatMat(Delta_d, G_d, kTrans);
- // Eq. (B.29): n = \sum_i tr(\Delta \Sigma_{i}^{-1} \Delta S_{i})
- BaseFloat n = 0;
- SpMatrix<double> inv_covar;
- for (int32 i = 0, num_gauss = sgmm.NumGauss(); i < num_gauss; i++) {
- sgmm.GetInvCovars(i, &inv_covar);
- n += TraceMatSpMatSp(Delta_d, kTrans, inv_covar, Delta_d, kNoTrans,
- stats.G_[i]);
- }
-
- BaseFloat step_size = 0.0;
- // initialize just to get rid of compile errors.
- BaseFloat obj_step_old, obj_step_new = 0.0;
- Matrix<double> new_A(dim, dim);
- Matrix<double> B(dim, dim);
- for (int32 iter_step = 0; iter_step < max_iters; iter_step++) {
- if (iter_step == 0) {
- obj_step_old = stats.beta_ * A.LogDet(); // Q_0 = \beta * log det(A)
- } else {
- obj_step_old = obj_step_new;
- }
-
- // Eq. (B.30); B = (A + k\Delta^{-C})^{-1} \Delta^{-C}
- new_A.CopyFromMat(A);
- new_A.AddMat(step_size, Delta_C, kNoTrans);
- new_A.InvertDouble();
- B.AddMatMat(1.0, new_A, kNoTrans, Delta_C, kNoTrans, 0.0);
-
- BaseFloat d = m - step_size * n + stats.beta_ * TraceMat(B);
- BaseFloat d2 = -n - stats.beta_ * TraceMatMat(B, B, kNoTrans);
- if (std::fabs(d / d2) < 0.000001) { break; } // converged
-
- BaseFloat step_size_change = -(d / d2);
- step_size += step_size_change; // Eq. (B.33)
-
- // Halve step size when the auxiliary function decreases.
- do {
- new_A.CopyFromMat(A);
- new_A.AddMat(step_size, Delta_C, kNoTrans);
- BaseFloat logdet = new_A.LogDet();
- obj_step_new = stats.beta_ * logdet + step_size * m -
- 0.5 * step_size * step_size * n;
-
- if (obj_step_new - obj_step_old < -0.001) {
- KALDI_WARN << "Objective function decreased (" << obj_step_old << "->"
- << obj_step_new << "). Halving step size change ("
- << step_size << " -> " << (step_size - (step_size_change/2))
- << ")";
- step_size_change /= 2;
- step_size -= step_size_change; // take away half of our step
- } // Facing numeric precision issues. Compute in double?
- } while (obj_step_new - obj_step_old < -0.001 && step_size_change > 1e-05);
- }
- return step_size;
-}
-
-
-bool FmllrSgmmAccs::Update(const AmSgmm &sgmm,
- const SgmmFmllrGlobalParams &globals,
- const SgmmFmllrConfig &opts,
- Matrix<BaseFloat> *out_xform,
- BaseFloat *frame_count, BaseFloat *auxf_out) const {
- BaseFloat auxf_improv = 0.0, logdet = 0.0;
- KALDI_ASSERT(out_xform->NumRows() == dim_ && out_xform->NumCols() == dim_+1);
- BaseFloat mincount = (globals.HasBasis() ?
- std::min(opts.fmllr_min_count_basis, opts.fmllr_min_count_full) :
- opts.fmllr_min_count);
- bool using_subspace = (globals.HasBasis() ?
- (stats_.beta_ < opts.fmllr_min_count_full) : false);
-
- if (globals.IsEmpty())
- KALDI_ERR << "Must set up pre-transforms before estimating FMLLR.";
-
- KALDI_VLOG(1) << "Mincount = " << mincount << "; Basis: "
- << std::string(globals.HasBasis()? "yes; " : "no; ")
- << "Using subspace: " << std::string(using_subspace? "yes; "
- : "no; ");
-
- int32 num_bases = 0;
- if (using_subspace) {
- KALDI_ASSERT(globals.fmllr_bases_.size() != 0);
- int32 max_bases = std::min(static_cast<int32>(globals.fmllr_bases_.size()),
- opts.num_fmllr_bases);
- num_bases = (opts.bases_occ_scale <= 0.0)? max_bases :
- std::min(max_bases, static_cast<int32>(std::floor(opts.bases_occ_scale
- * stats_.beta_)));
- KALDI_VLOG(1) << "Have " << stats_.beta_ << " frames for speaker: Using "
- << num_bases << " fMLLR bases.";
- }
-
- // initialization just to get rid of compile errors.
- BaseFloat auxf_old = 0, auxf_new = 0;
- if (frame_count != NULL) *frame_count = stats_.beta_;
-
- // If occupancy is greater than the min count, update the transform
- if (stats_.beta_ >= mincount) {
- for (int32 iter = 0; iter < opts.fmllr_iters; iter++) {
- Matrix<BaseFloat> grad(dim_, dim_ + 1, kSetZero);
- Matrix<BaseFloat> G(dim_, dim_ + 1, kSetZero);
- auxf_new = this->FmllrObjGradient(sgmm, *out_xform, &grad, &G);
-
- // For diagnostic purposes
- KALDI_VLOG(3) << "Iter " << iter << ": Auxiliary function = "
- << (auxf_new / stats_.beta_) << " per frame over " << stats_.beta_
- << " frames";
-
- if (iter > 0) {
- // For diagnostic purposes
- KALDI_VLOG(2) << "Iter " << iter << ": Auxiliary function improvement: "
- << ((auxf_new - auxf_old) / stats_.beta_) << " per frame over "
- << (stats_.beta_) << " frames";
- auxf_improv += auxf_new - auxf_old;
- }
-
- Matrix<BaseFloat> pre_xformed_grad(dim_, dim_ + 1, kSetZero);
- ApplyPreXformToGradient(globals, grad, &pre_xformed_grad);
-// std::cout << "Pre-X Grad = " << pre_xformed_grad << std::endl;
-
- // Transform P_sk with the Hessian
- Matrix<BaseFloat> hess_xformed_grad(dim_, dim_ + 1, kSetZero);
- ApplyHessianXformToGradient(globals, pre_xformed_grad,
- &hess_xformed_grad);
-// std::cout << "Hess-X Grad = " << hess_xformed_grad << std::endl;
-
- // Update the actual FMLLR transform matrices
- Matrix<BaseFloat> hess_xformed_delta(dim_, dim_ + 1, kUndefined);
- if (using_subspace) {
- // Note that in this case we can simply store the speaker-specific
- // coefficients for each of the basis matrices. The current
- // implementation stores the computed transform to simplify the code!
- hess_xformed_delta.SetZero();
- for (int32 b = 0; b < num_bases; b++) { // Eq (B.20)
- hess_xformed_delta.AddMat(TraceMatMat(globals.fmllr_bases_[b],
- hess_xformed_grad, kTrans),
- globals.fmllr_bases_[b], kNoTrans);
- }
- hess_xformed_delta.Scale(1 / stats_.beta_);
- } else {
- hess_xformed_delta.CopyFromMat(hess_xformed_grad);
- hess_xformed_delta.Scale(1 / stats_.beta_); // Eq. (B.19)
- }
-
-// std::cout << "Hess-X Delta = " << hess_xformed_delta << std::endl;
-
- // Transform Delta with the Hessian
- Matrix<BaseFloat> pre_xformed_delta(dim_, dim_ + 1, kSetZero);
- ApplyInvHessianXformToChange(globals, hess_xformed_delta,
- &pre_xformed_delta);
-
- // Apply inverse pre-transform to Delta
- Matrix<BaseFloat> delta(dim_, dim_ + 1, kSetZero);
- ApplyInvPreXformToChange(globals, pre_xformed_delta, &delta);
-
-#ifdef KALDI_PARANOID
- // Check whether co-ordinate transformation is correct.
- {
- BaseFloat tr1 = TraceMatMat(delta, grad, kTrans);
- BaseFloat tr2 = TraceMatMat(pre_xformed_delta, pre_xformed_grad,
- kTrans);
- BaseFloat tr3 = TraceMatMat(hess_xformed_delta, hess_xformed_grad,
- kTrans);
- AssertEqual(tr1, tr2, 1e-5);
- AssertEqual(tr2, tr3, 1e-5);
- }
-#endif
-
- // Calculate the optimal step size
- SubMatrix<BaseFloat> A(*out_xform, 0, dim_, 0, dim_);
- BaseFloat step_size = CalcFmllrStepSize(stats_, sgmm, delta, A, G,
- opts.fmllr_iters);
-
- // Update: W <-- W + k \Delta Eq. (B.34)
- out_xform->AddMat(step_size, delta, kNoTrans);
- auxf_old = auxf_new;
-
- // Check the objective function change for the last iteration
- if (iter == opts.fmllr_iters - 1) {
- auxf_new = this->FmllrObjGradient(sgmm, *out_xform, NULL, NULL);
- logdet = A.LogDet();
- // SubMatrix A points to the memory location of out_xform, and so will
- // contain the updated value
-
- KALDI_VLOG(2) << "Iter " << iter << ": Auxiliary function improvement: "
- << ((auxf_new - auxf_old) / stats_.beta_) << " per frame over "
- << (stats_.beta_) << " frames";
- auxf_improv += auxf_new - auxf_old;
- }
- }
- if (auxf_out != NULL) *auxf_out = auxf_improv;
- auxf_improv /= (stats_.beta_ + 1.0e-10);
-
- KALDI_LOG << "Auxiliary function improvement for FMLLR = " << auxf_improv
- << " per frame over " << stats_.beta_ << " frames. Log-determinant = "
- << logdet;
- return true;
- } else {
- KALDI_ASSERT(stats_.beta_ < mincount);
-// std::cerr.precision(10);
-// std::cerr.setf(std::ios::fixed,std::ios::floatfield);
- KALDI_WARN << "Not updating FMLLR because count is " << stats_.beta_
- << " < " << (mincount);
- if (auxf_out != NULL) *auxf_out = 0.0;
- return false;
- } // Do not use the transform if it does not have enough counts
- KALDI_ASSERT(false); // Should never be reached.
-}
-
-void EstimateSgmmFmllrSubspace(const SpMatrix<double> &fmllr_grad_scatter,
- int32 num_fmllr_bases, int32 feat_dim,
- SgmmFmllrGlobalParams *globals, double min_eig) {
- KALDI_ASSERT(num_fmllr_bases > 0 && feat_dim > 0);
- if (num_fmllr_bases > feat_dim * (feat_dim + 1)) {
- num_fmllr_bases = feat_dim * (feat_dim + 1);
- KALDI_WARN << "Limiting number of fMLLR bases to be the same as transform "
- << "dimension.";
- }
-
- vector< Matrix<BaseFloat> > &fmllr_bases(globals->fmllr_bases_);
-
- Vector<double> s(fmllr_grad_scatter.NumRows());
- Matrix<double> U(fmllr_grad_scatter.NumRows(),
- fmllr_grad_scatter.NumRows());
- try {
- fmllr_grad_scatter.Eig(&s, &U);
- SortSvd(&s, &U); // in case was not exactly sorted.
- KALDI_VLOG(1) << "Eigenvalues (max 200) of CMLLR scatter are: "
- << (SubVector<double>(s, 0,
- std::min(static_cast<MatrixIndexT>(200),
- s.Dim())));
-
-// for (int32 b = 2; b < num_fmllr_bases; b++) {
-// if (s(b) < min_eig) {
-// num_fmllr_bases = b;
-// KALDI_WARN << "Limiting number of fMLLR bases to " << num_fmllr_bases
-// << " because of small eigenvalues.";
-// break;
-// }
-// }
-
- U.Transpose(); // Now the rows of U correspond to the basis vectors.
- fmllr_bases.resize(num_fmllr_bases);
- for (int32 b = 0; b < num_fmllr_bases; b++) {
- fmllr_bases[b].Resize(feat_dim, feat_dim + 1, kSetZero);
- fmllr_bases[b].CopyRowsFromVec(U.Row(b));
- }
- KALDI_LOG << "Estimated " << num_fmllr_bases << " fMLLR basis matrices.";
- } catch(const std::exception &e) {
- KALDI_WARN << "Not estimating FMLLR bases because of a thrown exception:\n"
- << e.what();
- fmllr_bases.resize(0);
- }
-} // End of EstimateSgmmFmllrSubspace
-
-
-} // namespace kaldi
-
diff --git a/src/sgmm/fmllr-sgmm.h b/src/sgmm/fmllr-sgmm.h
--- a/src/sgmm/fmllr-sgmm.h
+++ /dev/null
@@ -1,192 +0,0 @@
-// sgmm/fmllr-sgmm.h
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef KALDI_SGMM_FMLLR_SGMM_H_
-#define KALDI_SGMM_FMLLR_SGMM_H_
-
-#include <string>
-#include <vector>
-
-#include "base/kaldi-common.h"
-#include "sgmm/am-sgmm.h"
-#include "transform/transform-common.h"
-#include "util/kaldi-table.h"
-#include "util/kaldi-holder.h"
-#include "itf/options-itf.h"
-
-namespace kaldi {
-
-/** \struct SgmmFmllrConfig
- * Configuration variables needed in the estimation of FMLLR for SGMMs.
- */
-struct SgmmFmllrConfig {
- int32 fmllr_iters; ///< Number of iterations in FMLLR estimation.
- int32 step_iters; ///< Iterations to find optimal FMLLR step size.
- /// Minimum occupancy count to estimate FMLLR using basis matrices.
- BaseFloat fmllr_min_count_basis;
- /// Minimum occupancy count to estimate FMLLR without basis matrices.
- BaseFloat fmllr_min_count;
- /// Minimum occupancy count to stop using FMLLR bases and switch to
- /// regular FMLLR estimation.
- BaseFloat fmllr_min_count_full;
- /// Number of basis matrices to use for FMLLR estimation. Can only *reduce*
- /// the number of bases present. Overridden by the 'bases_occ_scale' option.
- int32 num_fmllr_bases;
- /// Scale per-speaker count to determine number of CMLLR bases.
- BaseFloat bases_occ_scale;
-
- SgmmFmllrConfig() {
- fmllr_iters = 5;
- step_iters = 10;
- fmllr_min_count_basis = 100.0;
- fmllr_min_count = 1000.0;
- fmllr_min_count_full = 5000.0;
- num_fmllr_bases = 50;
- bases_occ_scale = 0.2;
- }
-
- void Register(OptionsItf *opts);
-};
-
-inline void SgmmFmllrConfig::Register(OptionsItf *opts) {
- std::string module = "SgmmFmllrConfig: ";
- opts->Register("fmllr-iters", &fmllr_iters, module+
- "Number of iterations in FMLLR estimation.");
- opts->Register("fmllr-step-iters", &step_iters, module+
- "Number of iterations to find optimal FMLLR step size.");
- opts->Register("fmllr-min-count-bases", &fmllr_min_count_basis, module+
- "Minimum occupancy count to estimate FMLLR using basis matrices.");
- opts->Register("fmllr-min-count", &fmllr_min_count, module+
- "Minimum occupancy count to estimate FMLLR (without bases).");
- opts->Register("fmllr-min-count-full", &fmllr_min_count_full, module+
- "Minimum occupancy count to stop using basis matrices for FMLLR.");
- opts->Register("fmllr-num-bases", &num_fmllr_bases, module+
- "Number of FMLLR basis matrices.");
- opts->Register("fmllr-bases-occ-scale", &bases_occ_scale, module+
- "Scale per-speaker count to determine number of CMLLR bases.");
-}
-
-
-/** \class SgmmFmllrGlobalParams
- * Global adaptation parameters.
- */
-class SgmmFmllrGlobalParams {
- public:
- void Init(const AmSgmm &sgmm, const Vector<BaseFloat> &state_occs);
- void Write(std::ostream &out_stream, bool binary) const;
- void Read(std::istream &in_stream, bool binary);
- bool IsEmpty() const {
- return (pre_xform_.NumRows() == 0 || inv_xform_.NumRows() == 0 ||
- mean_scatter_.Dim() == 0);
- }
- bool HasBasis() const { return fmllr_bases_.size() != 0; }
-
- /// Pre-transform matrix. Dim is [D][D+1].
- Matrix<BaseFloat> pre_xform_;
- /// Inverse of pre-transform. Dim is [D][D+1].
- Matrix<BaseFloat> inv_xform_;
- /// Diagonal of mean-scatter matrix. Dim is [D]
- Vector<BaseFloat> mean_scatter_;
- /// \tilde{W}_b. [b][d][d], dim is [B][D][D+1].
- std::vector< Matrix<BaseFloat> > fmllr_bases_;
-};
-
-inline void SgmmFmllrGlobalParams::Init(const AmSgmm &sgmm,
- const Vector<BaseFloat> &state_occs) {
- sgmm.ComputeFmllrPreXform(state_occs, &pre_xform_, &inv_xform_,
- &mean_scatter_);
-}
-
-/** \class FmllrSgmmAccs
- * Class for computing the accumulators needed for the maximum-likelihood
- * estimate of FMLLR transforms for a subspace GMM acoustic model.
- */
-class FmllrSgmmAccs {
- public:
- FmllrSgmmAccs() : dim_(-1) {}
- ~FmllrSgmmAccs() {}
-
- void Init(int32 dim, int32 num_gaussians);
- void SetZero() { stats_.SetZero(); }
-
- void Write(std::ostream &out_stream, bool binary) const;
- void Read(std::istream &in_stream, bool binary, bool add);
-
- /// Accumulation routine that computes the Gaussian posteriors and calls
- /// the AccumulateFromPosteriors function with the computed posteriors.
- /// The 'data' argument is not FMLLR-transformed and is needed in addition
- /// to the the 'frame_vars' since the latter only contains a copy of the
- /// transformed feature vector.
- BaseFloat Accumulate(const AmSgmm &sgmm,
- const SgmmPerSpkDerivedVars &spk,
- const VectorBase<BaseFloat> &data,
- const SgmmPerFrameDerivedVars &frame_vars,
- int32 state_index, BaseFloat weight);
-
- void AccumulateFromPosteriors(const AmSgmm &sgmm,
- const SgmmPerSpkDerivedVars &spk,
- const VectorBase<BaseFloat> &data,
- const std::vector<int32> &gauss_select,
- const Matrix<BaseFloat> &posteriors,
- int32 state_index);
-
- void AccumulateForFmllrSubspace(const AmSgmm &sgmm,
- const SgmmFmllrGlobalParams &fmllr_globals,
- SpMatrix<double> *grad_scatter);
-
- BaseFloat FmllrObjGradient(const AmSgmm &sgmm,
- const Matrix<BaseFloat> &xform,
- Matrix<BaseFloat> *grad_out,
- Matrix<BaseFloat> *G_out) const;
-
- /// Computes the FMLLR transform from the accumulated stats, using the
- /// pre-transforms in fmllr_globals. Expects the transform matrix out_xform
- /// to be initialized to the correct size. Returns true if the transform was
- /// updated (i.e. had enough counts).
- bool Update(const AmSgmm &model,
- const SgmmFmllrGlobalParams &fmllr_globals,
- const SgmmFmllrConfig &opts, Matrix<BaseFloat> *out_xform,
- BaseFloat *frame_count, BaseFloat *auxf_improv) const;
-
- /// Accessors
- int32 Dim() const { return dim_; }
- const AffineXformStats &stats() const { return stats_; }
-
- private:
- AffineXformStats stats_; ///< Accumulated stats
- int32 dim_; ///< Dimension of feature vectors
-
- // Cannot have copy constructor and assigment operator
- KALDI_DISALLOW_COPY_AND_ASSIGN(FmllrSgmmAccs);
-};
-
-/// Computes the fMLLR basis matrices given the scatter of the vectorized
-/// gradients (eq: B.10). The result is stored in 'fmllr_globals'.
-/// The actual number of bases may be less than 'num_fmllr_bases' depending
-/// on the feature dimension and number of eigenvalues greater than 'min_eig'.
-void EstimateSgmmFmllrSubspace(const SpMatrix<double> &fmllr_grad_scatter,
- int32 num_fmllr_bases, int32 feat_dim,
- SgmmFmllrGlobalParams *fmllr_globals,
- double min_eig = 0.0);
-
-} // namespace kaldi
-
-#endif // KALDI_SGMM_FMLLR_SGMM_H_
diff --git a/src/sgmm/sgmm-clusterable.cc b/src/sgmm/sgmm-clusterable.cc
+++ /dev/null
@@ -1,280 +0,0 @@
-// sgmm/sgmm-clusterable.cc
-
-// Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "sgmm/sgmm-clusterable.h"
-#include "hmm/hmm-utils.h"
-
-namespace kaldi {
-
-void SgmmClusterable::Accumulate(
- const SgmmPerFrameDerivedVars &per_frame_vars,
- int32 j, // state index in original SGMM.
- BaseFloat weight) {
- Matrix<BaseFloat> post;
- KALDI_ASSERT(weight >= 0.0); // Doesn't make sense to use negative weights here.
- // Compute Gaussian-level posteriors.
- // Note: "post" is indexed by Gaussian-selection index.
- sgmm_.ComponentPosteriors(per_frame_vars, j, &post);
- if (weight != 1.0) post.Scale(weight);
- const std::vector<int32> &gselect = per_frame_vars.gselect;
- for (int32 ki = 0; ki < gselect.size(); ki++) {
- int32 i = gselect[ki];
- BaseFloat gamma = 0.0; // Sum the weight over all the vectors (index m) in
- // the state. In sensible cases there should be just one vector per state
- // at the point where we do this, though.
- for (int32 m = 0; m < post.NumCols(); m++) gamma += post(ki, m);
- gamma_(i) += gamma;
- y_.AddVec(gamma, per_frame_vars.zti.Row(ki));
- }
- // Invalidate my_H_, if present, since it's not efficient to
- // keep it updated during accumulation.
- if (my_H_.NumRows() != 0)
- my_H_.Resize(0);
-}
-
-BaseFloat SgmmClusterable::Objf() const {
- // Objective function consists of the expected log-likelihood of
- // a weight (assuming we estimate the weights directly as parameters
- // instead of the whole subspace thing on the weights), plus
- // the auxiliary function improvement we would get from estimating
- // the state vector v_j starting from zero. Note: zero is an
- // arbitrary starting point-- we could use any value as long as
- // we were consistent.
- KALDI_ASSERT(static_cast<int32>(H_.size()) == sgmm_.NumGauss());
- if (my_H_.NumRows() == 0.0) {
- SgmmClusterable *s = static_cast<SgmmClusterable*>(this->Copy()); // will
- // set up my_H_, which we need.
- BaseFloat ans = s->Objf();
- delete s;
- return ans;
- }
- double ans = 0.0;
- double tot_gamma = gamma_.Sum(), tot_gamma2 = 0.0;
- if (tot_gamma == 0.0) return 0.0;
- int32 I = gamma_.Dim();
-
- for (int32 i = 0; i < I; i++) {
- double gamma = gamma_(i);
- if (gamma > 0.0) { // Note: should not be negative-- if it is, due to
- double prob = gamma / tot_gamma;
- if (prob > 0.0) { // Note: prob could be zero due to underflow-- this
- // happened! [we can get tiny values due to floating-point roundoff
- // while subtracting clusterable objects].
- ans += gamma * Log(gamma / tot_gamma);
- }
- }
- tot_gamma2 += gamma;
- }
- if (tot_gamma2 == 0.0)
- return 0.0; // No positive elements... maybe small negative ones were from
- // round off.
-
- // objf improvement is y^T H^{-1} y.
- // We'll try to compute this using Cholesky, first, which is more
- // efficient; if this fails or appears to lead to big values,
- // we'll back off to a more efficient SVD-based implementation.
- try {
- TpMatrix<double> C(my_H_.NumRows());
- C.Cholesky(my_H_);
- C.Invert();
- for (int32 i = 0; i < C.NumRows(); i++)
- if (fabs(C(i, i)) > 100.0) {
- KALDI_VLOG(3) << "Condion-number probably bad: element is "
- << C(i, i);
- throw std::runtime_error("Bad condition number"); // back off to SVD.
- }
- // Note: assuming things are well preconditioned, the elements
- // C(i,i) should be of the rough magnitude 1/sqrt(count).
- Vector<double> yC(C.NumRows());
- // Note: if we decompose H = C C^T, then the line below
- // does yC = C^{-1} y. Note: we are computing the inner
- // product y^T H^{-1} y. H^{-1} = C^{-T} C^{-1}, so
- // y^T H^{-1} y = y^T C^{-T} C^{-1} y = yC^T yC.
- yC.AddTpVec(1.0, C, kNoTrans, y_, 0.0);
- ans += 0.5 * VecVec(yC, yC);
- } catch (...) { // Choleksy threw, or we detected bad condition.
- // we'll do this using an SVD-based implementation that will
- // deal with non-invertible matrices.
- KALDI_VLOG(3) << "Backing off to SVD-based objective computation.";
- Vector<double> v(y_.Dim()); // Initialized automatically to zero.
- ans += SolveQuadraticProblem(my_H_, y_, SolverOptions(), &v); // The objective function
- // change from estimating this vector.
- }
- return ans;
-}
-
-void SgmmClusterable::SetZero() {
- gamma_.SetZero();
- y_.SetZero();
- my_H_.SetZero(); // Should work even if empty.
-}
-
-void SgmmClusterable::Add(const Clusterable &other_in) {
- const SgmmClusterable *other =
- static_cast<const SgmmClusterable*>(&other_in);
- gamma_.AddVec(1.0, other->gamma_);
- y_.AddVec(1.0, other->y_);
- if (!H_.empty()) { // we need to compute my_H_.
- if (my_H_.NumRows() != 0 && other->my_H_.NumRows() != 0)
- my_H_.AddSp(1.0, other->my_H_);
- else {
- my_H_.Resize(0);
- ComputeH();
- }
- }
-}
-
-void SgmmClusterable::Sub(const Clusterable &other_in) {
- const SgmmClusterable *other =
- static_cast<const SgmmClusterable*>(&other_in);
- gamma_.AddVec(-1.0, other->gamma_);
- y_.AddVec(-1.0, other->y_);
- if (!H_.empty()) {
- if (my_H_.NumRows() != 0 && other->my_H_.NumRows() != 0)
- my_H_.AddSp(-1.0, other->my_H_);
- else {
- my_H_.Resize(0);
- ComputeH();
- }
- }
-}
-
-BaseFloat SgmmClusterable::Normalizer() const {
- return gamma_.Sum();
-}
-
-Clusterable *SgmmClusterable::Copy() const {
- SgmmClusterable *ans = new SgmmClusterable(sgmm_, H_);
- ans->gamma_.CopyFromVec(gamma_);
- ans->y_.CopyFromVec(y_);
- if (!H_.empty()) {
- if (my_H_.NumRows() == 0.0) ans->ComputeH();
- else {
- ans->my_H_.Resize(my_H_.NumRows());
- ans->my_H_.CopyFromSp(my_H_);
- }
- }
- return ans;
-}
-
-void SgmmClusterable::Scale(BaseFloat f) {
- KALDI_ASSERT(f >= 0.0);
- gamma_.Scale(f);
- y_.Scale(f);
- if (my_H_.NumRows() != 0) my_H_.Scale(f);
-}
-
-void SgmmClusterable::Write(std::ostream &os, bool binary) const {
- gamma_.Write(os, binary);
- y_.Write(os, binary);
-}
-
-Clusterable *SgmmClusterable::ReadNew(std::istream &is, bool binary) const {
- SgmmClusterable *ans = new SgmmClusterable(sgmm_, H_);
- ans->gamma_.Read(is, binary);
- ans->y_.Read(is, binary);
- if (!H_.empty()) ans->ComputeH();
- return ans;
-}
-
-
-bool AccumulateSgmmTreeStats(const TransitionModel &trans_model,
- const AmSgmm &am_sgmm,
- const std::vector<SpMatrix<double> > &H,
- int N, // context window size.
- int P, // central position.
- const std::vector<int32> &ci_phones, // must be sorted
- const std::vector<int32> &alignment,
- const std::vector<std::vector<int32> > &gselect,
- const SgmmPerSpkDerivedVars &per_spk_vars,
- const Matrix<BaseFloat> &features,
- std::map<EventType, SgmmClusterable*> *stats) {
- KALDI_ASSERT(IsSortedAndUniq(ci_phones));
- std::vector<std::vector<int32> > split_alignment;
- bool ans = SplitToPhones(trans_model, alignment, &split_alignment);
- if (!ans) {
- KALDI_WARN << "AccumulateTreeStats: bad alignment.";
- return false;
- }
- int t = 0;
- SgmmPerFrameDerivedVars per_frame_vars;
-
- KALDI_ASSERT(features.NumRows() == static_cast<int32>(alignment.size())
- && alignment.size() == gselect.size());
- for (int i = -N; i < static_cast<int>(split_alignment.size()); i++) {
- // consider window starting at i, only if i+P is within
- // list of phones.
- if (i + P >= 0 && i + P < static_cast<int>(split_alignment.size())) {
- int32 central_phone = trans_model.TransitionIdToPhone(split_alignment[i+P][0]);
- bool is_ctx_dep = ! std::binary_search(ci_phones.begin(),
- ci_phones.end(),
- central_phone);
- EventType evec;
- for (int j = 0; j < N; j++) {
- int phone;
- if (i + j >= 0 && i + j < static_cast<int>(split_alignment.size()))
- phone = trans_model.TransitionIdToPhone(split_alignment[i+j][0]);
- else
- phone = 0; // ContextDependency class uses 0 to mean "out of window".
-
- if (is_ctx_dep || j == P)
- evec.push_back(std::make_pair(static_cast<EventKeyType>(j), static_cast<EventValueType>(phone)));
- }
- for (int j = 0; j < static_cast<int>(split_alignment[i+P].size());j++) {
- // for central phone of this window...
- EventType evec_more(evec);
- int32 pdf_id = trans_model.TransitionIdToPdf(split_alignment[i+P][j]),
- pdf_class = trans_model.TransitionIdToPdfClass(split_alignment[i+P][j]);
- // pdf_id represents the acoustic state in the current model.
- // pdf_class will normally by 0, 1 or 2 for a 3-state HMM.
-
- std::pair<EventKeyType, EventValueType> pr(kPdfClass, pdf_class);
- evec_more.push_back(pr);
- std::sort(evec_more.begin(), evec_more.end()); // these must be sorted!
- if (stats->count(evec_more) == 0)
- (*stats)[evec_more] = new SgmmClusterable(am_sgmm, H);
-
- am_sgmm.ComputePerFrameVars(features.Row(t), gselect[t], per_spk_vars, 0.0,
- &per_frame_vars);
- BaseFloat weight = 1.0; // weight is one, since we have alignment.
- (*stats)[evec_more]->Accumulate(per_frame_vars, pdf_id, weight);
- t++;
- }
- }
- }
- KALDI_ASSERT(t == static_cast<int>(alignment.size()));
- return true;
-}
-
-void SgmmClusterable::ComputeH() {
- // We're computing my_H_, as a weighted sum of H_, with gamma_ as the
- // weights.
- KALDI_ASSERT(!H_.empty() && my_H_.NumRows() == 0); // Invalid to call this if H_ empty,
- // or my_H_ already set up.
- my_H_.Resize(H_[0].NumRows()); // will initialize to zero.
- KALDI_ASSERT(static_cast<int32>(H_.size()) == gamma_.Dim());
- for (int32 i = 0; i < gamma_.Dim(); i++) {
- double gamma = gamma_(i);
- if (gamma > 0.0) my_H_.AddSp(gamma, H_[i]);
- }
-}
-
-
-} // end namespace kaldi
diff --git a/src/sgmm/sgmm-clusterable.h b/src/sgmm/sgmm-clusterable.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// sgmm/sgmm-clusterable.h
-
-// Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_SGMM_SGMM_CLUSTERABLE_H_
-#define KALDI_SGMM_SGMM_CLUSTERABLE_H_
-
-#include <vector>
-#include <queue>
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "itf/clusterable-itf.h"
-
-namespace kaldi {
-
-/// This header defines an object that can be used to create decision
-/// trees using a form of SGMM statistics. It is analogous to the
-/// GaussClusterable object, but uses the SGMM. The auxiliary function
-/// it uses is related to the normal SGMM auxiliary function, but for
-/// efficiency it uses a simpler model on the weights, which is equivalent
-/// to assuming the weights w_{ji} [there no index m since we assume one
-/// mixture per state!] are directly estimated using ML, instead of being
-/// computed from v_j and w_i as in the actual SGMM.
-
-class SgmmClusterable: public Clusterable {
- public:
- SgmmClusterable(const AmSgmm &sgmm,
- const std::vector< SpMatrix<double> > &H): // H can be empty vector
- // at initialization. Used to cache something from the model.
- sgmm_(sgmm),
- H_(H),
- gamma_(sgmm.NumGauss()),
- y_(sgmm.PhoneSpaceDim()) { }
- virtual std::string Type() const { return "sgmm"; }
-
- /// compare with the Accumulate function of MleAmSgmmAccs
- /// Note: the pdf-index j, relating to the original SGMM
- /// in sgmm_, is only needed to select the right vector to
- /// compute Gaussian-level alignments with.
- void Accumulate(const SgmmPerFrameDerivedVars &frame_vars,
- int32 j,
- BaseFloat weight);
-
- virtual BaseFloat Objf() const;
- virtual void SetZero();
- virtual void Add(const Clusterable &other_in);
- virtual void Sub(const Clusterable &other_in);
- virtual BaseFloat Normalizer() const;
- virtual Clusterable *Copy() const;
- virtual void Scale(BaseFloat f);
- virtual void Write(std::ostream &os, bool binary) const;
- virtual Clusterable *ReadNew(std::istream &is, bool binary) const;
- virtual ~SgmmClusterable() {}
-
- const Vector<double> &gamma () const { return gamma_; }
- const Vector<double> &y() const { return y_; }
- private:
- void ComputeH(); // Compute the quantity my_H_, from gamma_ and H_.
-
- const AmSgmm &sgmm_; // Reference to the SGMM object, needed to compute
- // objective functions.
- const std::vector< SpMatrix<double> > &H_; // Reference to a vector of SpMatrix which
- // should have been computed from the model using ComputeH(). Needed for Objf() function.
- Vector<double> gamma_; // Occupation counts for each Gaussian index. Comparable
- // to the gamma_{jmi} statistics in the SGMM paper.
- Vector<double> y_; // Statistics comparable to the y_{jm} statistics in the SGMM
- // paper.
-
- SpMatrix<double> my_H_; // This quantity is a weighted sum over the H quantities,
- // weighted by gamma_(i). It's only nonempty if the H_ matrix is nonempty.
- // This quantity is never written to disk; it is to be viewed as a kind of
- // cache, present only for purposes of fast objective-function computation.
-};
-
-
-/// Comparable to AccumulateTreeStats, but this version
-/// accumulates stats of type SgmmClusterable. Returns
-/// true on success.
-bool AccumulateSgmmTreeStats(const TransitionModel &trans_model,
- const AmSgmm &am_sgmm,
- const std::vector<SpMatrix<double> > &H, // this is a ref. to temp.
- // storage needed in the clusterable class... can be empty
- // during accumulation as it doesn't call Objf().
- int N, // context window size.
- int P, // central position.
- const std::vector<int32> &ci_phones, // must be sorted
- const std::vector<int32> &alignment,
- const std::vector<std::vector<int32> > &gselect,
- const SgmmPerSpkDerivedVars &per_spk_vars,
- const Matrix<BaseFloat> &features,
- std::map<EventType, SgmmClusterable*> *stats);
-
-
-} // end namespace kaldi
-
-#endif // KALDI_SGMM_SGMM_CLUSTERABLE_H_
diff --git a/src/sgmmbin/Makefile b/src/sgmmbin/Makefile
--- a/src/sgmmbin/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-
-all:
-EXTRA_CXXFLAGS = -Wno-sign-compare
-include ../kaldi.mk
-
-BINFILES = init-ubm sgmm-align-compiled sgmm-acc-stats-ali \
- sgmm-sum-accs sgmm-est sgmm-decode-faster sgmm-init sgmm-gselect \
- sgmm-est-fmllr sgmm-acc-stats sgmm-est-spkvecs sgmm-post-to-gpost \
- sgmm-acc-stats-gpost sgmm-est-spkvecs-gpost sgmm-comp-prexform \
- sgmm-est-fmllr-gpost sgmm-acc-fmllrbasis-ali sgmm-est-fmllrbasis \
- sgmm-calc-distances sgmm-normalize sgmm-latgen-simple \
- sgmm-latgen-faster sgmm-rescore-lattice sgmm-copy sgmm-write-ubm \
- sgmm-mixup sgmm-info sgmm-acc-tree-stats sgmm-sum-tree-stats \
- sgmm-build-tree sgmm-cluster-phones sgmm-init-from-tree-stats \
- sgmm-est-ebw sgmm-acc-stats2 sgmm-est-multi
-
-OBJFILES =
-
-
-
-TESTFILES =
-
-
-ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \
- ../fstext/kaldi-fstext.a ../sgmm/kaldi-sgmm.a ../hmm/kaldi-hmm.a \
- ../feat/kaldi-feat.a ../transform/kaldi-transform.a \
- ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \
- ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a \
- ../base/kaldi-base.a
-
-include ../makefiles/default_rules.mk
diff --git a/src/sgmmbin/init-ubm.cc b/src/sgmmbin/init-ubm.cc
--- a/src/sgmmbin/init-ubm.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-// sgmmbin/init-ubm.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/kaldi-io.h"
-#include "gmm/diag-gmm.h"
-#include "gmm/full-gmm.h"
-#include "gmm/am-diag-gmm.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
- typedef kaldi::BaseFloat BaseFloat;
-
- const char *usage =
- "Cluster the Gaussians in a diagonal-GMM acoustic model\n"
- "to a single full-covariance or diagonal-covariance GMM.\n"
- "Usage: init-ubm [options] <model-file> <state-occs> <gmm-out>\n";
-
- bool binary_write = true, fullcov_ubm = true;
- kaldi::ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
- po.Register("fullcov-ubm", &fullcov_ubm, "Write out full covariance UBM.");
- kaldi::UbmClusteringOptions ubm_opts;
- ubm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 3) {
- po.PrintUsage();
- exit(1);
- }
- ubm_opts.Check();
-
- std::string model_in_filename = po.GetArg(1),
- occs_in_filename = po.GetArg(2),
- gmm_out_filename = po.GetArg(3);
-
- kaldi::AmDiagGmm am_gmm;
- kaldi::TransitionModel trans_model;
- {
- bool binary_read;
- kaldi::Input ki(model_in_filename, &binary_read);
- trans_model.Read(ki.Stream(), binary_read);
- am_gmm.Read(ki.Stream(), binary_read);
- }
-
- kaldi::Vector<BaseFloat> state_occs;
- state_occs.Resize(am_gmm.NumPdfs());
- {
- bool binary_read;
- kaldi::Input ki(occs_in_filename, &binary_read);
- state_occs.Read(ki.Stream(), binary_read);
- }
-
- kaldi::DiagGmm ubm;
- ClusterGaussiansToUbm(am_gmm, state_occs, ubm_opts, &ubm);
- if (fullcov_ubm) {
- kaldi::FullGmm full_ubm;
- full_ubm.CopyFromDiagGmm(ubm);
- kaldi::Output ko(gmm_out_filename, binary_write);
- full_ubm.Write(ko.Stream(), binary_write);
- } else {
- kaldi::Output ko(gmm_out_filename, binary_write);
- ubm.Write(ko.Stream(), binary_write);
- }
-
- KALDI_LOG << "Written UBM to " << gmm_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc b/src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc
+++ /dev/null
@@ -1,216 +0,0 @@
-// sgmmbin/sgmm-acc-fmllrbasis-ali.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "util/common-utils.h"
-#include "hmm/transition-model.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/fmllr-sgmm.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- const char *usage =
- "Accumulate stats for FMLLR bases training.\n"
- "Usage: sgmm-acc-fmllrbasis-ali [options] <model-in> <feature-rspecifier> "
- "<alignments-rspecifier> <spk2utt-rspecifier> <stats-out>\n"
- "e.g.: sgmm-acc-fmllrbasis-ali 1.mdl scp:train.scp ark:1.ali 1.acc\n";
-
- ParseOptions po(usage);
- bool binary_write = true;
- std::string gselect_rspecifier, spkvecs_rspecifier, silphones_str;
- BaseFloat sil_weight = 0.0;
- kaldi::SgmmGselectConfig sgmm_opts;
- po.Register("binary", &binary_write, "Write output in binary mode");
- po.Register("gselect", &gselect_rspecifier,
- "Precomputed Gaussian indices (rspecifier)");
- po.Register("spk-vecs", &spkvecs_rspecifier,
- "Speaker vectors to use during aligment (rspecifier)");
- po.Register("sil-phone-list", &silphones_str,
- "Colon-separated list of phones (to weigh differently)");
- po.Register("sil-weight", &sil_weight, "Weight for \"silence\" phones.");
- sgmm_opts.Register(&po);
- po.Read(argc, argv);
-
- if (po.NumArgs() != 5) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- alignments_rspecifier = po.GetArg(3),
- spk2utt_rspecifier = po.GetArg(4),
- accs_wxfilename = po.GetArg(5);
-
- typedef kaldi::int32 int32;
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- SgmmFmllrGlobalParams fmllr_globals;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- fmllr_globals.Read(ki.Stream(), binary);
- }
-
- SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
-
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
- RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-
- std::vector<int32> silence_phones;
- if (!SplitStringToIntegers(silphones_str, ":", false, &silence_phones)) {
- KALDI_ERR << "Silence-phones string has wrong format "
- << silphones_str;
- }
- ConstIntegerSet<int32> silence_set(silence_phones); // faster lookup.
-
-
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
- SpMatrix<double> fmllr_grad_scatter;
- int32 dim = am_sgmm.FeatureDim();
- fmllr_grad_scatter.Resize(dim * (dim + 1), kSetZero);
- FmllrSgmmAccs spk_stats;
- spk_stats.Init(dim, am_sgmm.NumGauss());
-
- double tot_like = 0.0, tot_t = 0.0;
- int32 num_done = 0, num_no_alignment = 0, num_other_error = 0;
-
- for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
- spk_stats.SetZero();
- string spk = spk2utt_reader.Key();
- const std::vector<string> &uttlist = spk2utt_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(spk)) {
- spk_vars.v_s = spkvecs_reader.Value(spk);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << spk;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- for (size_t i = 0; i < uttlist.size(); i++) {
- std::string utt = uttlist[i];
- if (!alignments_reader.HasKey(utt)) {
- num_no_alignment++;
- continue;
- }
- const std::vector<int32> &alignment = alignments_reader.Value(utt);
-
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find features for utterance " << utt;
- num_other_error++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-
- if (alignment.size() != feats.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (alignment.size()) <<
- " vs. "<< (feats.NumRows());
- num_other_error++;
- continue;
- }
-
- bool have_gselect = false;
- if (gselect_reader.IsOpen()) {
- if (gselect_reader.HasKey(utt)) {
- have_gselect = (gselect_reader.Value(utt).size() == feats.NumRows());
- if (!have_gselect)
- KALDI_WARN << "Gaussian-selection info available for utterance "
- << utt << " has wrong size.";
- } else {
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt;
- }
- }
-
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : NULL);
- double file_like = 0.0, file_t = 0.0;
-
-
- for (size_t i = 0; i < alignment.size(); i++) {
- int32 tid = alignment[i]; // transition identifier.
- int32 pdf_id = trans_model.TransitionIdToPdf(tid),
- phone = trans_model.TransitionIdToPhone(tid);
- BaseFloat weight = 1.0;
- if (silence_set.count(phone) != 0) { // is a silence.
- if (sil_weight > 0.0)
- weight = sil_weight;
- else
- continue;
- }
-
- std::vector<int32> this_gselect;
- if (gselect != NULL)
- this_gselect = (*gselect)[i];
- else
- am_sgmm.GaussianSelection(sgmm_opts, feats.Row(i), &this_gselect);
- am_sgmm.ComputePerFrameVars(feats.Row(i), this_gselect, spk_vars, 0.0,
- &per_frame_vars);
- file_like +=
- spk_stats.Accumulate(am_sgmm, spk_vars, feats.Row(i),
- per_frame_vars, pdf_id, weight);
- file_t += weight;
- } // end looping over all the frames in the utterance
- KALDI_VLOG(1) << "Average likelihood for utterance " << utt << " is "
- << (file_like/file_t) << " over " << file_t << " frames";
- tot_like += file_like;
- tot_t += file_t;
- num_done++;
- if (num_done % 20 == 0)
- KALDI_VLOG(1) << "After " << num_done << " utterances: Average "
- << "likelihood per frame = " << (tot_like/tot_t)
- << ", over " << tot_t << " frames";
- } // end looping over all utterance for a given speaker
- spk_stats.AccumulateForFmllrSubspace(am_sgmm, fmllr_globals, &fmllr_grad_scatter);
- } // end looping over all speakers
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment
- << " with no alignments, " << num_other_error
- << " with other errors.";
-
- KALDI_LOG << "Overall likelihood per frame frame = " << (tot_like/tot_t)
- << " over " << tot_t << " frames.";
-
- {
- Output ko(accs_wxfilename, binary_write);
- fmllr_grad_scatter.Write(ko.Stream(), binary_write);
- KALDI_LOG << "Written accs to: " << accs_wxfilename;
- }
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-acc-stats-ali.cc b/src/sgmmbin/sgmm-acc-stats-ali.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-// sgmmbin/sgmm-acc-stats-ali.cc
-
-// Copyright 2009-2012 Saarland University (author: Arnab Ghoshal);
-// Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- const char *usage =
- "Accumulate stats for SGMM training.\n"
- "Usage: sgmm-acc-stats-ali [options] <model-in> <feature-rspecifier> "
- "<alignments-rspecifier> <stats-out>\n"
- "e.g.: sgmm-acc-stats-ali 1.mdl 1.ali scp:train.scp ark:1.ali 1.acc\n";
-
- ParseOptions po(usage);
- bool binary = true;
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- std::string update_flags_str = "vMNwcSt";
- BaseFloat rand_prune = 1.0e-05;
- kaldi::SgmmGselectConfig sgmm_opts;
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("rand-prune", &rand_prune, "Randomized pruning threshold for posteriors");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to update: subset of vMNwcS.");
- sgmm_opts.Register(&po);
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- alignments_rspecifier = po.GetArg(3),
- accs_wxfilename = po.GetArg(4);
-
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Vector<double> transition_accs;
- if (acc_flags & kaldi::kSgmmTransitions)
- trans_model.InitStats(&transition_accs);
- MleAmSgmmAccs sgmm_accs(rand_prune);
- sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags);
-
- double tot_like = 0.0;
- kaldi::int64 tot_t = 0;
-
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier);
-
- RandomAccessInt32VectorVectorReader gselect_reader;
- if (!gselect_rspecifier.empty() && !gselect_reader.Open(gselect_rspecifier))
- KALDI_ERR << "Unable to open stream for gaussian-selection indices";
-
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- int32 num_done = 0, num_no_alignment = 0, num_other_error = 0;
- for (; !feature_reader.Done(); feature_reader.Next()) {
- std::string utt = feature_reader.Key();
- if (!alignments_reader.HasKey(utt)) {
- num_no_alignment++;
- } else {
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- const std::vector<int32> &alignment = alignments_reader.Value(utt);
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == mat.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)\n";
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- if (alignment.size() != mat.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (alignment.size()) <<
- " vs. "<< (mat.NumRows());
- num_other_error++;
- continue;
- }
-
- num_done++;
- BaseFloat tot_like_this_file = 0.0;
-
- for (size_t i = 0; i < alignment.size(); i++) {
- int32 tid = alignment[i], // transition identifier.
- pdf_id = trans_model.TransitionIdToPdf(tid);
- if (acc_flags & kaldi::kSgmmTransitions)
- trans_model.Accumulate(1.0, tid, &transition_accs);
- std::vector<int32> this_gselect;
- if (!gselect->empty()) this_gselect = (*gselect)[i];
- else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect);
- am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0,
- &per_frame_vars);
- tot_like_this_file += sgmm_accs.Accumulate(am_sgmm, per_frame_vars,
- spk_vars.v_s, pdf_id, 1.0,
- acc_flags);
- }
-
- sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance.
-
- KALDI_VLOG(2) << "Average like for this file is "
- << (tot_like_this_file/alignment.size()) << " over "
- << alignment.size() <<" frames.";
- tot_like += tot_like_this_file;
- tot_t += alignment.size();
- if (num_done % 50 == 0) {
- KALDI_LOG << "Processed " << num_done << " utterances; for utterance "
- << utt << " avg. like is "
- << (tot_like_this_file/alignment.size())
- << " over " << alignment.size() <<" frames.";
- }
- }
- }
- KALDI_LOG << "Overall like per frame (Gaussian only) = "
- << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment
- << " with no alignments, " << num_other_error
- << " with other errors.";
-
- {
- Output ko(accs_wxfilename, binary);
- // TODO(arnab): Ideally, we shouldn't be writing transition accs if not
- // asked for, but that will complicate reading later. To be fixed?
- transition_accs.Write(ko.Stream(), binary);
- sgmm_accs.Write(ko.Stream(), binary);
- }
- KALDI_LOG << "Written accs.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-acc-stats-gpost.cc b/src/sgmmbin/sgmm-acc-stats-gpost.cc
+++ /dev/null
@@ -1,174 +0,0 @@
-// sgmmbin/sgmm-acc-stats-gpost.cc
-
-// Copyright 2009-2012 Saarland University (Author: Arnab Ghoshal)
-// Microsoft Corporation;
-// Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-
-
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- try {
- const char *usage =
- "Accumulate stats for SGMM training, given Gaussian-level posteriors\n"
- "Usage: sgmm-acc-stats-gpost [options] <model-in> <feature-rspecifier> "
- "<gpost-rspecifier> <stats-out>\n"
- "e.g.: sgmm-acc-stats-gpost 1.mdl 1.ali scp:train.scp ark, s, cs:- 1.acc\n";
-
- ParseOptions po(usage);
- bool binary = true;
- std::string spkvecs_rspecifier, utt2spk_rspecifier;
- std::string update_flags_str = "vMNwcSt";
- BaseFloat rand_prune = 1.0e-05;
-
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to update: subset of vMNwcS.");
- po.Read(argc, argv);
-
- kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- gpost_rspecifier = po.GetArg(3),
- accs_wxfilename = po.GetArg(4);
-
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- // Initialize the readers before the model, as this can avoid
- // crashes on systems with low virtual memory.
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessSgmmGauPostReader gpost_reader(gpost_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Vector<double> transition_accs;
- if (acc_flags & kaldi::kSgmmTransitions)
- trans_model.InitStats(&transition_accs);
- MleAmSgmmAccs sgmm_accs(rand_prune);
- sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags);
-
- double tot_t = 0.0;
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
- for (; !feature_reader.Done(); feature_reader.Next()) {
- std::string utt = feature_reader.Key();
- if (!gpost_reader.HasKey(utt)) {
- num_no_posterior++;
- } else {
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- const SgmmGauPost &gpost = gpost_reader.Value(utt);
-
- if (gpost.size() != mat.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (gpost.size()) <<
- " vs. "<< (mat.NumRows());
- num_other_error++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- num_done++;
- BaseFloat tot_weight = 0.0;
-
- for (size_t i = 0; i < gpost.size(); i++) {
- const std::vector<int32> &gselect = gpost[i].gselect;
- am_sgmm.ComputePerFrameVars(mat.Row(i), gselect, spk_vars, 0.0,
- &per_frame_vars);
-
- for (size_t j = 0; j < gpost[i].tids.size(); j++) {
- int32 tid = gpost[i].tids[j], // transition identifier.
- pdf_id = trans_model.TransitionIdToPdf(tid);
-
- BaseFloat weight = gpost[i].posteriors[j].Sum();
- if (acc_flags & kaldi::kSgmmTransitions)
- trans_model.Accumulate(weight, tid, &transition_accs);
- sgmm_accs.AccumulateFromPosteriors(am_sgmm, per_frame_vars,
- gpost[i].posteriors[j],
- spk_vars.v_s,
- pdf_id, acc_flags);
- tot_weight += weight;
- }
- }
-
- sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance.
-
- tot_t += tot_weight;
- if (num_done % 50 == 0)
- KALDI_LOG << "Processed " << num_done << " utterances";
- }
- }
- KALDI_LOG << "Overall number of frames is " << tot_t;
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
- << " with no posteriors, " << num_other_error
- << " with other errors.";
-
- {
- Output ko(accs_wxfilename, binary);
- // TODO(arnab): Ideally, we shouldn't be writing transition accs if not
- // asked for, but that will complicate reading later. To be fixed?
- transition_accs.Write(ko.Stream(), binary);
- sgmm_accs.Write(ko.Stream(), binary);
- }
- KALDI_LOG << "Written accs.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-acc-stats.cc b/src/sgmmbin/sgmm-acc-stats.cc
+++ /dev/null
@@ -1,211 +0,0 @@
-// sgmmbin/sgmm-acc-stats.cc
-
-// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal),
-// 2014 Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/posterior.h"
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- try {
- const char *usage =
- "Accumulate stats for SGMM training.\n"
- "Usage: sgmm-acc-stats [options] <model-in> <feature-rspecifier> "
- "<posteriors-rspecifier> <stats-out>\n"
- "e.g.: sgmm-acc-stats 1.mdl 1.ali scp:train.scp 'ark:ali-to-post 1.ali ark:-|' 1.acc\n";
-
- ParseOptions po(usage);
- bool binary = true;
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- std::string update_flags_str = "vMNwcSt";
- BaseFloat rand_prune = 1.0e-05;
- SgmmGselectConfig sgmm_opts;
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to accumulate "
- "stats for: subset of vMNwcS.");
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- posteriors_rspecifier = po.GetArg(3),
- accs_wxfilename = po.GetArg(4);
-
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- // Initialize the readers before the model, as the model can
- // be large, and we don't want to call fork() after reading it if
- // virtual memory may be low.
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Vector<double> transition_accs;
- if (acc_flags & kaldi::kSgmmTransitions)
- trans_model.InitStats(&transition_accs);
- MleAmSgmmAccs sgmm_accs(rand_prune);
- sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags);
-
- double tot_like = 0.0;
- double tot_t = 0;
-
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
- for (; !feature_reader.Done(); feature_reader.Next()) {
- std::string utt = feature_reader.Key();
- if (!posteriors_reader.HasKey(utt)) {
- num_no_posterior++;
- } else {
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- const Posterior &posterior = posteriors_reader.Value(utt);
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == mat.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- if (posterior.size() != mat.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (posterior.size()) <<
- " vs. "<< (mat.NumRows());
- num_other_error++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- num_done++;
- BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
-
- Posterior pdf_posterior;
- ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior);
- for (size_t i = 0; i < posterior.size(); i++) {
- if (posterior[i].empty())
- continue;
- std::vector<int32> this_gselect;
- if (!gselect->empty()) this_gselect = (*gselect)[i];
- else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect);
- am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0,
- &per_frame_vars);
-
- // Accumulates for SGMM.
- for (size_t j = 0; j < pdf_posterior[i].size(); j++) {
- int32 pdf_id = pdf_posterior[i][j].first;
- BaseFloat weight = pdf_posterior[i][j].second;
- tot_like_this_file += sgmm_accs.Accumulate(am_sgmm, per_frame_vars,
- spk_vars.v_s, pdf_id,
- weight, acc_flags)
- * weight;
- tot_weight += weight;
- }
-
- // Accumulates for transitions.
- for (size_t j = 0; j < posterior[i].size(); j++) {
- if (acc_flags & kaldi::kSgmmTransitions) {
- int32 tid = posterior[i][j].first;
- BaseFloat weight = posterior[i][j].second;
- trans_model.Accumulate(weight, tid, &transition_accs);
- }
- }
- }
-
- sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance.
-
- KALDI_VLOG(2) << "Average like for this file is "
- << (tot_like_this_file/tot_weight) << " over "
- << tot_weight <<" frames.";
- tot_like += tot_like_this_file;
- tot_t += tot_weight;
- if (num_done % 50 == 0) {
- KALDI_LOG << "Processed " << num_done << " utterances; for utterance "
- << utt << " avg. like is "
- << (tot_like_this_file/tot_weight)
- << " over " << tot_weight <<" frames.";
- }
- }
- }
- KALDI_LOG << "Overall like per frame (Gaussian only) = "
- << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
- << " with no posteriors, " << num_other_error
- << " with other errors.";
-
- {
- Output ko(accs_wxfilename, binary);
- // TODO(arnab): Ideally, we shouldn't be writing transition accs if not
- // asked for, but that will complicate reading later. To be fixed?
- transition_accs.Write(ko.Stream(), binary);
- sgmm_accs.Write(ko.Stream(), binary);
- }
- KALDI_LOG << "Written accs.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-acc-stats2.cc b/src/sgmmbin/sgmm-acc-stats2.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-// sgmmbin/sgmm-acc-stats2.cc
-
-// Copyright 2009-2012 Saarland University (Author: Arnab Ghoshal),
-// Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/posterior.h"
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- try {
- const char *usage =
- "Accumulate numerator and denominator stats for discriminative training\n"
- "of SGMMs (input is posteriors of mixed sign)\n"
- "Usage: sgmm-acc-stats2 [options] <model-in> <feature-rspecifier> "
- "<posteriors-rspecifier> <num-stats-out> <den-stats-out>\n"
- "e.g.: sgmm-acc-stats2 1.mdl 1.ali scp:train.scp ark:1.posts num.acc den.acc\n";
-
- ParseOptions po(usage);
- bool binary = true;
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- std::string update_flags_str = "vMNwcSt";
- BaseFloat rand_prune = 1.0e-05;
- SgmmGselectConfig sgmm_opts;
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to accumulate "
- "stats for: subset of vMNwcS.");
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str);
-
- if (po.NumArgs() != 5) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- posteriors_rspecifier = po.GetArg(3),
- num_accs_wxfilename = po.GetArg(4),
- den_accs_wxfilename = po.GetArg(5);
-
-
- using namespace kaldi;
- typedef kaldi::int32 int32;
- typedef kaldi::int64 int64;
-
- // Initialize the readers before the model, as the model can
- // be large, and we don't want to call fork() after reading it if
- // virtual memory may be low.
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Vector<double> num_transition_accs, den_transition_accs;
- if (acc_flags & kaldi::kSgmmTransitions) {
- trans_model.InitStats(&num_transition_accs);
- trans_model.InitStats(&den_transition_accs);
- }
- MleAmSgmmAccs num_sgmm_accs(rand_prune), den_sgmm_accs(rand_prune);
- num_sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags);
- den_sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags);
-
- double tot_like = 0.0, tot_weight = 0.0, tot_abs_weight = 0.0;
- int64 tot_frames = 0;
-
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
- for (; !feature_reader.Done(); feature_reader.Next()) {
- std::string utt = feature_reader.Key();
- if (!posteriors_reader.HasKey(utt)) {
- num_no_posterior++;
- } else {
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- const Posterior &posterior = posteriors_reader.Value(utt);
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == mat.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- if (posterior.size() != mat.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (posterior.size()) <<
- " vs. "<< (mat.NumRows());
- num_other_error++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- num_done++;
- BaseFloat tot_like_this_file = 0.0, tot_weight_this_file = 0.0,
- tot_abs_weight_this_file = 0.0;
-
- for (size_t i = 0; i < posterior.size(); i++) {
- std::vector<int32> this_gselect;
- if (!gselect->empty()) this_gselect = (*gselect)[i];
- else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect);
- am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0,
- &per_frame_vars);
-
- for (size_t j = 0; j < posterior[i].size(); j++) {
- int32 tid = posterior[i][j].first, // transition identifier.
- pdf_id = trans_model.TransitionIdToPdf(tid);
- BaseFloat weight = posterior[i][j].second,
- abs_weight = std::abs(weight);
-
- if (acc_flags & kaldi::kSgmmTransitions) {
- trans_model.Accumulate(abs_weight, tid, weight > 0 ?
- &num_transition_accs : &den_transition_accs);
- }
- tot_like_this_file +=
- (weight > 0 ? num_sgmm_accs : den_sgmm_accs).Accumulate(
- am_sgmm, per_frame_vars, spk_vars.v_s, pdf_id,
- abs_weight, acc_flags)
- * weight;
- tot_weight_this_file += weight;
- tot_abs_weight_this_file += abs_weight;
- }
- }
- num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance.
- den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s);
-
- tot_like += tot_like_this_file;
- tot_weight += tot_weight_this_file;
- tot_abs_weight += tot_abs_weight_this_file;
- tot_frames += posterior.size();
- if (num_done % 50 == 0)
- KALDI_LOG << "Processed " << num_done << " utterances.";
- }
- }
- KALDI_LOG << "Overall weighted acoustic likelihood per frame was "
- << (tot_like/tot_frames) << " over " << tot_frames << " frames; "
- << "average weight per frame is " << (tot_weight/tot_frames)
- << ", average abs(weight) per frame is "
- << (tot_abs_weight/tot_frames);
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
- << " with no posteriors, " << num_other_error
- << " with other errors.";
-
- {
- Output ko(num_accs_wxfilename, binary);
- // TODO(arnab): Ideally, we shouldn't be writing transition accs if not
- // asked for, but that will complicate reading later. To be fixed?
- num_transition_accs.Write(ko.Stream(), binary);
- num_sgmm_accs.Write(ko.Stream(), binary);
- }
- {
- Output ko(den_accs_wxfilename, binary);
- den_transition_accs.Write(ko.Stream(), binary);
- den_sgmm_accs.Write(ko.Stream(), binary);
- }
- KALDI_LOG << "Written accs.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-acc-tree-stats.cc b/src/sgmmbin/sgmm-acc-tree-stats.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-// sgmmbin/sgmm-acc-tree-stats.cc
-
-// Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "tree/build-tree-utils.h"
-#include "sgmm/sgmm-clusterable.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- try {
- const char *usage =
- "Accumulate statistics for decision tree training.\n"
- "This version accumulates statistics in the form of state-specific "
- "SGMM stats; you need to use the program sgmm-build-tree to build "
- "the tree (and sgmm-sum-tree-accs to sum the stats).\n"
- "Usage: sgmm-acc-tree-stats [options] sgmm-model-in features-rspecifier "
- "alignments-rspecifier [tree-accs-out]\n"
- "e.g.: sgmm-acc-tree-stats --ci-phones=48:49 1.mdl scp:train.scp ark:1.ali 1.tacc\n";
-
- ParseOptions po(usage);
- bool binary = true;
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- string ci_phones_str;
- int N = 3, P = 1;
- SgmmGselectConfig sgmm_opts;
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("ci-phones", &ci_phones_str, "Colon-separated list of integer "
- "indices of context-independent phones.");
- po.Register("context-width", &N, "Context window size.");
- po.Register("central-position", &P,
- "Central context-window position (zero-based)");
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- if (po.NumArgs() < 3 || po.NumArgs() > 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string sgmm_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- alignment_rspecifier = po.GetArg(3),
- accs_wxfilename = po.GetOptArg(4);
-
- std::vector<int32> ci_phones;
- if (ci_phones_str != "") {
- SplitStringToIntegers(ci_phones_str, ":", false, &ci_phones);
- std::sort(ci_phones.begin(), ci_phones.end());
- if (!IsSortedAndUniq(ci_phones) || ci_phones[0] == 0) {
- KALDI_ERR << "Invalid set of ci_phones: " << ci_phones_str;
- }
- }
-
- TransitionModel trans_model;
- AmSgmm am_sgmm;
- std::vector<SpMatrix<double> > H; // Not initialized in this program-- not needed
- // as we don't call Objf() from stats.
- {
- bool binary;
- Input ki(sgmm_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- if (gselect_rspecifier.empty())
- KALDI_ERR << "--gselect option is required.";
-
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessInt32VectorReader alignment_reader(alignment_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- std::map<EventType, SgmmClusterable*> tree_stats;
-
- int num_done = 0, num_err = 0;
-
- for (; !feature_reader.Done(); feature_reader.Next()) {
- std::string utt = feature_reader.Key();
- if (!alignment_reader.HasKey(utt)) {
- num_err++;
- } else {
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- const std::vector<int32> &alignment = alignment_reader.Value(utt);
-
- if (!gselect_reader.HasKey(utt) ||
-
- gselect_reader.Value(utt).size() != mat.NumRows()) {
- KALDI_WARN << "No gselect information for utterance " << utt
- << " (or wrong size)";
- num_err++;
- continue;
- }
-
- const std::vector<std::vector<int32> > &gselect =
- gselect_reader.Value(utt);
-
- if (alignment.size() != mat.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (alignment.size())<<" vs. "<< (mat.NumRows());
- num_err++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- }
- } // else spk_vars is "empty"
-
-
- // The work gets done here.
- if (!AccumulateSgmmTreeStats(trans_model,
- am_sgmm,
- H,
- N, P,
- ci_phones,
- alignment,
- gselect,
- spk_vars,
- mat,
- &tree_stats)) {
- num_err++;
- } else {
- num_done++;
- if (num_done % 1000 == 0)
- KALDI_LOG << "Processed " << num_done << " utterances.";
- }
- }
- }
-
- BuildTreeStatsType stats; // Converting from a map to a vector of pairs.
-
- for (std::map<EventType, SgmmClusterable*>::const_iterator iter = tree_stats.begin();
- iter != tree_stats.end();
- iter++ ) {
- stats.push_back(std::make_pair(iter->first, static_cast<Clusterable*>(iter->second)));
- }
- tree_stats.clear();
-
- {
- Output ko(accs_wxfilename, binary);
- WriteBuildTreeStats(ko.Stream(), binary, stats);
- }
- KALDI_LOG << "Accumulated stats for " << num_done << " files, "
- << num_err << " failed.";
- KALDI_LOG << "Number of separate stats (context-dependent states) is "
- << stats.size();
- DeleteBuildTreeStats(&stats);
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-align-compiled.cc b/src/sgmmbin/sgmm-align-compiled.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-// sgmmbin/sgmm-align-compiled.cc
-
-// Copyright 2009-2011 Microsoft Corporation; Saarland University
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "hmm/hmm-utils.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "decoder/training-graph-compiler.h"
-#include "sgmm/decodable-am-sgmm.h"
-#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- using fst::SymbolTable;
- using fst::VectorFst;
- using fst::StdArc;
-
- const char *usage =
- "Align features given [SGMM-based] models.\n"
- "Usage: sgmm-align-compiled [options] model-in graphs-rspecifier "
- "feature-rspecifier alignments-wspecifier\n"
- "e.g.: sgmm-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n";
-
- ParseOptions po(usage);
- bool binary = true;
- AlignConfig align_config;
- BaseFloat acoustic_scale = 1.0;
- BaseFloat transition_scale = 1.0;
- BaseFloat self_loop_scale = 1.0;
- BaseFloat log_prune = 5.0;
-
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- SgmmGselectConfig sgmm_opts;
-
- align_config.Register(&po);
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("log-prune", &log_prune, "Pruning beam used to reduce number "
- "of exp() evaluations.");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic "
- "likelihoods");
- po.Register("transition-scale", &transition_scale, "Scaling factor for "
- "some transition probabilities [see also self-loop-scale].");
- po.Register("self-loop-scale", &self_loop_scale, "Scaling factor for "
- "self-loop versus non-self-loop probability mass [controls "
- "most transition probabilities.]");
- po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices "
- "(rspecifier)");
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_in_filename = po.GetArg(1),
- fst_rspecifier = po.GetArg(2),
- feature_rspecifier = po.GetArg(3),
- alignment_wspecifier = po.GetArg(4);
-
- TransitionModel trans_model;
- AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- Int32VectorWriter alignment_writer(alignment_wspecifier);
-
- int32 num_done = 0, num_err = 0, num_retry = 0;
- double tot_like = 0.0;
- kaldi::int64 frame_count = 0;
-
- for (; !fst_reader.Done(); fst_reader.Next()) {
- std::string utt = fst_reader.Key();
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "No features found for utterance " << utt;
- num_err++;
- continue;
- }
- VectorFst<StdArc> decode_fst(fst_reader.Value());
- // stops copy-on-write of the fst by deleting the fst inside the reader,
- // since we're about to mutate the fst by adding transition probs.
- fst_reader.FreeCurrent();
-
- const Matrix<BaseFloat> &features = feature_reader.Value(utt);
- if (features.NumRows() == 0) {
- KALDI_WARN << "Empty features for utterance " << utt;
- num_err++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_err++;
- continue;
- }
- } // else spk_vars is "empty"
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == features.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- { // Add transition-probs to the FST.
- std::vector<int32> disambig_syms; // empty.
- AddTransitionProbs(trans_model, disambig_syms,
- transition_scale, self_loop_scale,
- &decode_fst);
- }
-
- DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars, trans_model,
- features, *gselect, log_prune, acoustic_scale);
-
- AlignUtteranceWrapper(align_config, utt,
- acoustic_scale, &decode_fst, &sgmm_decodable,
- &alignment_writer, NULL,
- &num_done, &num_err, &num_retry,
- &tot_like, &frame_count);
- }
-
- KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count)
- << " over " << frame_count<< " frames.";
- KALDI_LOG << "Retried " << num_retry << " out of "
- << (num_done + num_err) << " utterances.";
- KALDI_LOG << "Done " << num_done << ", errors on " << num_err;
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-build-tree.cc b/src/sgmmbin/sgmm-build-tree.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-// sgmmbin/sgmm-build-tree.cc
-
-// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "hmm/hmm-topology.h"
-#include "tree/context-dep.h"
-#include "tree/build-tree.h"
-#include "tree/build-tree-utils.h"
-#include "sgmm/sgmm-clusterable.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "util/text-utils.h"
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Train decision tree\n"
- "Usage: sgmm-build-tree [options] <old-sgmm-in> <tree-stats-in> "
- "<roots-file> <questions-file> <tree-out> [<sgmm-out>]\n"
- "e.g.: sgmm-build-tree 0.sgmm streeacc roots.txt 1.qst tree\n";
-
- bool binary = true;
- int32 P = 1, N = 3;
-
- BaseFloat thresh = 300.0;
- BaseFloat cluster_thresh = -1.0; // negative means use smallest split in splitting phase as thresh.
- int32 max_leaves = 0;
- std::string occs_out_filename;
-
- ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("context-width", &N, "Context window size [must match "
- "acc-tree-stats]");
- po.Register("central-position", &P, "Central position in context window "
- "[must match acc-tree-stats]");
- po.Register("max-leaves", &max_leaves, "Maximum number of leaves to be "
- "used in tree-buliding (if positive)");
- po.Register("thresh", &thresh, "Log-likelihood change threshold for "
- "tree-building");
- po.Register("cluster-thresh", &cluster_thresh, "Log-likelihood change "
- "threshold for clustering after tree-building");
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 5) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string sgmm_filename = po.GetArg(1),
- stats_filename = po.GetArg(2),
- roots_filename = po.GetArg(3),
- questions_filename = po.GetArg(4),
- tree_out_filename = po.GetArg(5);
-
- // Following 2 variables derived from roots file.
- // phone_sets is sets of phones that share their roots.
- // Just one phone each for normal systems.
- std::vector<std::vector<int32> > phone_sets;
- std::vector<bool> is_shared_root;
- std::vector<bool> is_split_root;
- {
- Input ki(roots_filename.c_str());
- ReadRootsFile(ki.Stream(), &phone_sets, &is_shared_root, &is_split_root);
- }
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(sgmm_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- const HmmTopology &topo = trans_model.GetTopo();
- std::vector<SpMatrix<double> > H;
- am_sgmm.ComputeH(&H);
-
- BuildTreeStatsType stats;
- {
- bool binary_in;
- SgmmClusterable sc(am_sgmm, H); // dummy stats needed to provide
- // type info, and access to am_sgmm and H.
- Input ki(stats_filename, &binary_in);
- ReadBuildTreeStats(ki.Stream(), binary_in, sc, &stats);
- }
- KALDI_LOG << "Number of separate statistics is " << stats.size();
-
- Questions qo;
- {
- bool binary_in;
- try {
- Input ki(questions_filename, &binary_in);
- qo.Read(ki.Stream(), binary_in);
- } catch (const std::exception &e) {
- KALDI_ERR << "Error reading questions file "<<questions_filename<<", error is: " << e.what();
- }
- }
-
-
- std::vector<int32> phone2num_pdf_classes;
- topo.GetPhoneToNumPdfClasses(&phone2num_pdf_classes);
-
- EventMap *to_pdf = NULL;
-
- //////// Build the tree. ////////////
-
- to_pdf = BuildTree(qo,
- phone_sets,
- phone2num_pdf_classes,
- is_shared_root,
- is_split_root,
- stats,
- thresh,
- max_leaves,
- cluster_thresh,
- P);
-
- { // This block is to warn about low counts.
- std::vector<BuildTreeStatsType> split_stats;
- SplitStatsByMap(stats, *to_pdf,
- &split_stats);
- for (size_t i = 0; i < split_stats.size(); i++)
- if (SumNormalizer(split_stats[i]) < 100.0)
- KALDI_VLOG(1) << "For pdf-id " << i << ", low count "
- << SumNormalizer(split_stats[i]);
- }
-
- ContextDependency ctx_dep(N, P, to_pdf); // takes ownership
- // of pointer "to_pdf", so set it NULL.
- to_pdf = NULL;
-
- WriteKaldiObject(ctx_dep, tree_out_filename, binary);
-
- { // This block is just doing some checks.
-
- std::vector<int32> all_phones;
- for (size_t i = 0; i < phone_sets.size(); i++)
- all_phones.insert(all_phones.end(),
- phone_sets[i].begin(), phone_sets[i].end());
- SortAndUniq(&all_phones);
- if (all_phones != topo.GetPhones()) {
- std::ostringstream ss;
- WriteIntegerVector(ss, false, all_phones);
- ss << " vs. ";
- WriteIntegerVector(ss, false, topo.GetPhones());
- KALDI_WARN << "Mismatch between phone sets provided in roots file, and those in topology: " << ss.str();
- }
- std::vector<int32> seen_phones;
- PossibleValues(P, stats, &seen_phones); // get phones seen in the data.
-
- std::vector<int32> unseen_phones; // diagnostic.
- for (size_t i = 0; i < all_phones.size(); i++)
- if (!std::binary_search(seen_phones.begin(), seen_phones.end(), all_phones[i]))
- unseen_phones.push_back(all_phones[i]);
- for (size_t i = 0; i < seen_phones.size(); i++)
- if (!std::binary_search(all_phones.begin(), all_phones.end(), seen_phones[i]))
- KALDI_ERR << "Phone " << (seen_phones[i])
- << " appears in stats but is not listed in roots file.";
- if (!unseen_phones.empty()) {
- std::ostringstream ss;
- for (size_t i = 0; i < unseen_phones.size(); i++)
- ss << unseen_phones[i] << ' ';
- // Note, unseen phones is just a warning as in certain kinds of
- // systems, this can be OK (e.g. where phone encodes position and
- // stress information).
- KALDI_WARN << "Saw no stats for following phones: " << ss.str();
- }
- }
-
- KALDI_LOG << "Wrote tree";
-
- DeleteBuildTreeStats(&stats);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
diff --git a/src/sgmmbin/sgmm-calc-distances.cc b/src/sgmmbin/sgmm-calc-distances.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// sgmmbin/sgmm-calc-distances.cc
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
-
- const char *usage =
- "Compute matrix of approximated K-L divergences between states\n"
- "Only works properly if a single substate per state.\n"
- "Usage: sgmm-calc-distances [options] model-in occs-in distances-out\n";
-
- bool binary = true;
- ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode");
- po.Read(argc, argv);
-
- if (po.NumArgs() != 3) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_in_filename = po.GetArg(1),
- occs_in_filename = po.GetArg(2),
- distances_out_filename = po.GetArg(3);
-
-
- AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- TransitionModel trans_model;
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Vector<BaseFloat> occs;
- ReadKaldiObject(occs_in_filename, &occs);
-
- Matrix<BaseFloat> dists(am_sgmm.NumPdfs(), am_sgmm.NumPdfs());
- AmSgmmFunctions::ComputeDistances(am_sgmm, occs, &dists);
-
- Output ko(distances_out_filename, binary);
- dists.Write(ko.Stream(), binary);
-
- KALDI_LOG << "Wrote distances to " << distances_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-cluster-phones.cc b/src/sgmmbin/sgmm-cluster-phones.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-// sgmmbin/sgmm-cluster-phones.cc
-
-// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "tree/build-tree.h"
-#include "tree/build-tree-utils.h"
-#include "tree/context-dep.h"
-#include "sgmm/sgmm-clusterable.h"
-#include "hmm/transition-model.h"
-#include "util/text-utils.h"
-
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Cluster phones (or sets of phones) into sets for various purposes\n"
- "Usage: sgmm-cluster-phones [options] <sgmm-in> <tree-stats-in> <phone-sets-in> <clustered-phones-out>\n"
- "e.g.: sgmm-cluster-phones 0.sgmm 1.tacc phonesets.txt questions.txt\n";
- // Format of phonesets.txt is e.g.
- // 1
- // 2 3 4
- // 5 6
- // ...
- // Format of questions.txt output is similar, but with more lines (and the same phone
- // may appear on multiple lines).
-
- // bool binary = true;
- int32 P = 1, N = 3; // Note: N does not matter.
- std::string pdf_class_list_str = "1"; // 1 is just the central position of 3.
- std::string mode = "questions";
- int32 num_classes = -1;
-
- ParseOptions po(usage);
- // po.Register("binary", &binary, "Write output in binary mode");
- po.Register("central-position", &P, "Central position in context window [must match acc-tree-stats]");
- po.Register("context-width", &N, "Does not have any effect-- included for scripting convenience.");
- po.Register("pdf-class-list", &pdf_class_list_str, "Colon-separated list of HMM positions to consider [Default = 1: just central position for 3-state models].");
- po.Register("mode", &mode, "Mode of operation: \"questions\"->sets suitable for decision trees; \"k-means\"->k-means algorithm, output k classes (set num-classes options)\n");
- po.Register("num-classes", &num_classes, "For k-means mode, number of classes.");
-
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string sgmm_rxfilename = po.GetArg(1),
- stats_rxfilename = po.GetArg(2),
- phone_sets_rxfilename = po.GetArg(3),
- phone_sets_wxfilename = po.GetArg(4);
-
- AmSgmm am_sgmm;
- {
- TransitionModel trans_model;
- bool binary_in;
- Input ki(sgmm_rxfilename, &binary_in);
- trans_model.Read(ki.Stream(), binary_in);
- am_sgmm.Read(ki.Stream(), binary_in);
- }
- std::vector<SpMatrix<double> > H;
- am_sgmm.ComputeH(&H);
-
- BuildTreeStatsType stats;
- { // Read tree stats.
- bool binary_in;
- SgmmClusterable sc(am_sgmm, H); // dummy needed to provide type and sgmm ref.
- Input ki(stats_rxfilename, &binary_in);
- ReadBuildTreeStats(ki.Stream(), binary_in, sc, &stats);
- }
- KALDI_LOG << "Number of separate states in stats is "
- << stats.size();
-
- std::vector<int32> pdf_class_list;
- if (!SplitStringToIntegers(pdf_class_list_str, ":", false, &pdf_class_list)
- || pdf_class_list.empty()) {
- KALDI_ERR << "Invalid pdf-class-list string [expecting colon-separated list of integers]: "
- << pdf_class_list_str;
- }
-
- std::vector<std::vector< int32> > phone_sets;
- if (!ReadIntegerVectorVectorSimple(phone_sets_rxfilename, &phone_sets))
- KALDI_ERR << "Could not read phone sets from "
- << PrintableRxfilename(phone_sets_rxfilename);
-
- if (phone_sets.size() == 0)
- KALDI_ERR << "No phone sets in phone sets file ";
-
- std::vector<std::vector<int32> > phone_sets_out;
-
- if (mode == "questions") {
- if (num_classes != -1)
- KALDI_ERR << "num-classes option is not (currently) compatible "
- "with \"questions\" mode.";
- AutomaticallyObtainQuestions(stats,
- phone_sets,
- pdf_class_list,
- P,
- &phone_sets_out);
- } else if (mode == "k-means") {
- if (num_classes <= 1 ||
- static_cast<size_t>(num_classes) > phone_sets.size())
- KALDI_ERR << "num-classes invalid: num_classes is " << num_classes
- << ", number of phone sets is " << phone_sets.size();
- KMeansClusterPhones(stats,
- phone_sets,
- pdf_class_list,
- P,
- num_classes,
- &phone_sets_out);
- }
-
- if (!WriteIntegerVectorVectorSimple(phone_sets_wxfilename, phone_sets_out))
- KALDI_ERR << "Error writing questions to "
- << PrintableWxfilename(phone_sets_wxfilename);
- else
- KALDI_LOG << "Wrote questions to "<<phone_sets_wxfilename;
-
- DeleteBuildTreeStats(&stats);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
diff --git a/src/sgmmbin/sgmm-comp-prexform.cc b/src/sgmmbin/sgmm-comp-prexform.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-// sgmmbin/sgmm-comp-prexform.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/fmllr-sgmm.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Compute \"pre-transform\" parameters required for estimating fMLLR with\n"
- "SGMMs, and write to a model file, after the SGMM.\n"
- "Usage: sgmm-comp-prexform [options] <sgmm-in> <occs-in> <sgmm-out>\n";
-
- bool binary = true;
- kaldi::ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 3) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string sgmm_in_filename = po.GetArg(1),
- occs_filename = po.GetArg(2),
- sgmm_out_filename = po.GetArg(3);
-
- kaldi::AmSgmm sgmm_in;
- kaldi::TransitionModel trans_model;
- {
- bool binary_read;
- kaldi::Input ki(sgmm_in_filename, &binary_read);
- trans_model.Read(ki.Stream(), binary_read);
- sgmm_in.Read(ki.Stream(), binary_read);
- }
-
- kaldi::Vector<kaldi::BaseFloat> occs;
- {
- bool binary_read;
- kaldi::Input ki(occs_filename, &binary_read);
- occs.Read(ki.Stream(), binary_read);
- }
-
- kaldi::SgmmFmllrGlobalParams fmllr_globals;
- sgmm_in.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_,
- &fmllr_globals.inv_xform_,
- &fmllr_globals.mean_scatter_);
-
- {
- kaldi::Output ko(sgmm_out_filename, binary);
- trans_model.Write(ko.Stream(), binary);
- sgmm_in.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll);
- fmllr_globals.Write(ko.Stream(), binary);
- }
-
- KALDI_LOG << "Written model to " << sgmm_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-copy.cc b/src/sgmmbin/sgmm-copy.cc
--- a/src/sgmmbin/sgmm-copy.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// sgmmbin/sgmm-copy.cc
-
-// Copyright 2009-2012 Microsoft Corporation
-// Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- const char *usage =
- "Copy SGMM (possibly changing binary/text format)\n"
- "Usage: sgmm-copy [options] <model-in> <model-out>\n"
- "e.g.: sgmm-copy --binary=false 1.mdl 1_text.mdl\n";
-
- bool binary_write = true;
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
-
- po.Read(argc, argv);
- if (po.NumArgs() != 2) {
- po.PrintUsage();
- exit(1);
- }
- std::string model_in_filename = po.GetArg(1),
- model_out_filename = po.GetArg(2);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- {
- Output ko(model_out_filename, binary_write);
- trans_model.Write(ko.Stream(), binary_write);
- am_sgmm.Write(ko.Stream(), binary_write, kSgmmWriteAll);
- }
-
-
- KALDI_LOG << "Written model to " << model_out_filename;
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-decode-faster.cc b/src/sgmmbin/sgmm-decode-faster.cc
+++ /dev/null
@@ -1,218 +0,0 @@
-// sgmmbin/sgmm-decode-faster.cc
-
-// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/faster-decoder.h"
-#include "sgmm/decodable-am-sgmm.h"
-#include "base/timer.h"
-#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- using fst::SymbolTable;
- using fst::VectorFst;
- using fst::StdArc;
-
- const char *usage =
- "Decode features using SGMM-based model.\n"
- "Usage: sgmm-decode-faster [options] <model-in> <fst-in> "
- "<features-rspecifier> <words-wspecifier> [alignments-wspecifier]\n";
- ParseOptions po(usage);
- bool allow_partial = true;
- BaseFloat acoustic_scale = 0.1;
- BaseFloat log_prune = 5.0;
- string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier,
- utt2spk_rspecifier;
-
- FasterDecoderOptions decoder_opts;
- decoder_opts.Register(&po, true); // true == include obscure settings.
- kaldi::SgmmGselectConfig sgmm_opts;
- sgmm_opts.Register(&po);
-
- po.Register("acoustic-scale", &acoustic_scale,
- "Scaling factor for acoustic likelihoods");
- po.Register("log-prune", &log_prune,
- "Pruning beam used to reduce number of exp() evaluations.");
- po.Register("word-symbol-table", &word_syms_filename,
- "Symbol table for words [for debug output]");
- po.Register("gselect", &gselect_rspecifier,
- "rspecifier for precomputed per-frame Gaussian indices.");
- po.Register("spk-vecs", &spkvecs_rspecifier,
- "rspecifier for speaker vectors");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("allow-partial", &allow_partial,
- "Produce output even when final state was not reached");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 4 || po.NumArgs() > 5) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_in_filename = po.GetArg(1),
- fst_in_filename = po.GetArg(2),
- feature_rspecifier = po.GetArg(3),
- words_wspecifier = po.GetArg(4),
- alignment_wspecifier = po.GetOptArg(5);
-
- TransitionModel trans_model;
- kaldi::AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Int32VectorWriter words_writer(words_wspecifier);
- Int32VectorWriter alignment_writer(alignment_wspecifier);
-
- fst::SymbolTable *word_syms = NULL;
- if (word_syms_filename != "")
- if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
- KALDI_ERR << "Could not read symbol table from file "
- << word_syms_filename;
-
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
- // It's important that we initialize decode_fst after feature_reader, as it
- // can prevent crashes on systems installed without enough virtual memory.
- // It has to do with what happens on UNIX systems if you call fork() on a
- // large process: the page-table entries are duplicated, which requires a
- // lot of virtual memory.
- VectorFst<StdArc> *decode_fst = fst::ReadFstKaldi(fst_in_filename);
-
- BaseFloat tot_like = 0.0;
- kaldi::int64 frame_count = 0;
- int num_success = 0, num_fail = 0;
- FasterDecoder decoder(*decode_fst, decoder_opts);
-
- Timer timer;
- const std::vector<std::vector<int32> > empty_gselect;
-
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- Matrix<BaseFloat> features(feature_reader.Value());
- feature_reader.FreeCurrent();
- if (features.NumRows() == 0) {
- KALDI_WARN << "Zero-length utterance: " << utt;
- num_fail++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_fail++;
- continue;
- }
- } // else spk_vars is "empty"
-
- bool has_gselect = false;
- if (gselect_reader.IsOpen()) {
- has_gselect = gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == features.NumRows();
- if (!has_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- }
- const std::vector<std::vector<int32> > *gselect =
- (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars,
- trans_model, features, *gselect,
- log_prune, acoustic_scale);
- decoder.Decode(&sgmm_decodable);
-
- VectorFst<LatticeArc> decoded; // linear FST.
-
- if ( (allow_partial || decoder.ReachedFinal())
- && decoder.GetBestPath(&decoded) ) {
- if (!decoder.ReachedFinal())
- KALDI_WARN << "Decoder did not reach end-state, "
- << "outputting partial traceback since --allow-partial=true";
- num_success++;
- std::vector<int32> alignment;
- std::vector<int32> words;
- LatticeWeight weight;
- frame_count += features.NumRows();
-
- GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
-
- words_writer.Write(utt, words);
- if (alignment_writer.IsOpen())
- alignment_writer.Write(utt, alignment);
- if (word_syms != NULL) {
- std::cerr << utt << ' ';
- for (size_t i = 0; i < words.size(); i++) {
- std::string s = word_syms->Find(words[i]);
- if (s == "")
- KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
- std::cerr << s << ' ';
- }
- std::cerr << '\n';
- }
- BaseFloat like = -weight.Value1() -weight.Value2();
- tot_like += like;
- KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
- << (like / features.NumRows()) << " over "
- << features.NumRows() << " frames.";
- } else {
- num_fail++;
- KALDI_WARN << "Did not successfully decode utterance " << utt
- << ", len = " << features.NumRows();
- }
- }
- double elapsed = timer.Elapsed();
- KALDI_LOG << "Time taken [excluding initialization] "<< elapsed
- << "s: real-time factor assuming 100 frames/sec is "
- << (elapsed*100.0/frame_count);
- KALDI_LOG << "Done " << num_success << " utterances, failed for "
- << num_fail;
- KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count)
- << " over " << frame_count << " frames.";
-
- delete word_syms;
- delete decode_fst;
- return (num_success != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
diff --git a/src/sgmmbin/sgmm-est-ebw.cc b/src/sgmmbin/sgmm-est-ebw.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-// sgmmbin/sgmm-est-ebw.cc
-
-// Copyright 2012 Johns Hopkins Univerity (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "thread/kaldi-thread.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm-ebw.h"
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- using std::string;
- try {
- const char *usage =
- "Estimate SGMM model parameters discriminatively using Extended\n"
- "Baum-Welch style of update\n"
- "Usage: sgmm-est-ebw [options] <model-in> <num-stats-in> <den-stats-in> <model-out>\n";
-
-
- string update_flags_str = "vMNwcSt";
- bool binary_write = true;
- string write_flags_str = "gsnu";
- EbwAmSgmmOptions opts;
-
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to "
- "update: subset of vMNwcSt.");
- po.Register("write-flags", &write_flags_str, "Which SGMM parameters to "
- "write: subset of gsnu");
- po.Register("num-threads", &g_num_threads, "Number of threads to use in "
- "weight update and normalizer computation");
- opts.Register(&po);
-
- po.Read(argc, argv);
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
- string model_in_filename = po.GetArg(1),
- num_stats_filename = po.GetArg(2),
- den_stats_filename = po.GetArg(3),
- model_out_filename = po.GetArg(4);
-
- SgmmUpdateFlagsType update_flags = StringToSgmmUpdateFlags(update_flags_str);
- SgmmWriteFlagsType write_flags = StringToSgmmWriteFlags(write_flags_str);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- MleAmSgmmAccs sgmm_num_accs;
- {
- bool binary;
- Vector<double> transition_accs; // won't be used.
- Input ki(num_stats_filename, &binary);
- transition_accs.Read(ki.Stream(), binary);
- sgmm_num_accs.Read(ki.Stream(), binary, false); // false == add; doesn't matter.
- }
- MleAmSgmmAccs sgmm_den_accs;
- {
- bool binary;
- Vector<double> transition_accs; // won't be used.
- Input ki(den_stats_filename, &binary);
- transition_accs.Read(ki.Stream(), binary);
- sgmm_den_accs.Read(ki.Stream(), binary, false); // false == add; doesn't matter.
- }
-
- sgmm_num_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics.
- sgmm_den_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics.
-
- { // Update SGMM.
- BaseFloat auxf_impr, count;
- kaldi::EbwAmSgmmUpdater sgmm_updater(opts);
- sgmm_updater.Update(sgmm_num_accs, sgmm_den_accs, &am_sgmm,
- update_flags, &auxf_impr, &count);
- KALDI_LOG << "Overall auxf impr/frame from SGMM update is " << (auxf_impr/count)
- << " over " << count << " frames.";
- }
-
- {
- Output ko(model_out_filename, binary_write);
- trans_model.Write(ko.Stream(), binary_write);
- am_sgmm.Write(ko.Stream(), binary_write, write_flags);
- }
-
- KALDI_LOG << "Wrote model to " << model_out_filename;
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
diff --git a/src/sgmmbin/sgmm-est-fmllr-gpost.cc b/src/sgmmbin/sgmm-est-fmllr-gpost.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-// sgmmbin/sgmm-est-fmllr-gpost.cc
-
-// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/fmllr-sgmm.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
- const SgmmGauPost &gpost,
- const TransitionModel &trans_model,
- const AmSgmm &am_sgmm,
- const SgmmPerSpkDerivedVars &spk_vars,
- BaseFloat logdet,
- FmllrSgmmAccs *spk_stats) {
-// kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- for (size_t i = 0; i < gpost.size(); i++) {
-// am_sgmm.ComputePerFrameVars(feats.Row(i), gpost[i].gselect, spk_vars,
-// logdet, &per_frame_vars);
-
- for (size_t j = 0; j < gpost[i].tids.size(); j++) {
- int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i].tids[j]);
- spk_stats->AccumulateFromPosteriors(am_sgmm, spk_vars, feats.Row(i),
- gpost[i].gselect,
- gpost[i].posteriors[j], pdf_id);
- }
- }
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
- using namespace kaldi;
- const char *usage =
- "Estimate FMLLR transform for SGMMs, either per utterance or for the "
- "supplied set of speakers (with spk2utt option).\n"
- "Reads Gaussian-level posteriors. Writes to a table of matrices.\n"
- "Usage: sgmm-est-fmllr-gpost [options] <model-in> <feature-rspecifier> "
- "<gpost-rspecifier> <mats-wspecifier>\n";
-
- ParseOptions po(usage);
- string spk2utt_rspecifier, spkvecs_rspecifier, fmllr_rspecifier;
- BaseFloat min_count = 100;
- SgmmFmllrConfig fmllr_opts;
-
- po.Register("spk2utt", &spk2utt_rspecifier,
- "File to read speaker to utterance-list map from.");
- po.Register("spkvec-min-count", &min_count,
- "Minimum count needed to estimate speaker vectors");
- po.Register("spk-vecs", &spkvecs_rspecifier,
- "Speaker vectors to use during aligment (rspecifier)");
- po.Register("input-fmllr", &fmllr_rspecifier,
- "Initial FMLLR transform per speaker (rspecifier)");
- fmllr_opts.Register(&po);
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- string model_rxfilename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- gpost_rspecifier = po.GetArg(3),
- fmllr_wspecifier = po.GetArg(4);
-
- TransitionModel trans_model;
- AmSgmm am_sgmm;
- SgmmFmllrGlobalParams fmllr_globals;
- {
- bool binary;
- Input ki(model_rxfilename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- fmllr_globals.Read(ki.Stream(), binary);
- }
-
- RandomAccessSgmmGauPostReader gpost_reader(gpost_rspecifier);
-
- RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-
- RandomAccessBaseFloatMatrixReader fmllr_reader(fmllr_rspecifier);
-
- BaseFloatMatrixWriter fmllr_writer(fmllr_wspecifier);
-
- int32 dim = am_sgmm.FeatureDim();
- FmllrSgmmAccs spk_stats;
- spk_stats.Init(dim, am_sgmm.NumGauss());
- Matrix<BaseFloat> fmllr_xform(dim, dim + 1);
- BaseFloat logdet = 0.0;
- double tot_impr = 0.0, tot_t = 0.0;
- int32 num_done = 0, num_no_gpost = 0, num_other_error = 0;
- std::vector<std::vector<int32> > empty_gselect;
-
- if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation
- SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
- for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
- spk_stats.SetZero();
- string spk = spk2utt_reader.Key();
- const vector<string> &uttlist = spk2utt_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(spk)) {
- spk_vars.v_s = spkvecs_reader.Value(spk);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << spk;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- if (fmllr_reader.IsOpen()) {
- if (fmllr_reader.HasKey(spk)) {
- fmllr_xform.CopyFromMat(fmllr_reader.Value(spk));
- logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet();
- } else {
- KALDI_WARN << "Cannot find FMLLR transform for " << spk;
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
- } else {
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
-
- for (size_t i = 0; i < uttlist.size(); i++) {
- std::string utt = uttlist[i];
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find features for utterance " << utt;
- continue;
- }
- if (!gpost_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posteriors for utterance " << utt;
- num_no_gpost++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
- const SgmmGauPost &gpost = gpost_reader.Value(utt);
- if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
- KALDI_WARN << "gpost vector has wrong size " << (gpost.size())
- << " vs. " << (feats.NumRows());
- num_other_error++;
- continue;
- }
-
- AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars,
- logdet, &spk_stats);
- num_done++;
- } // end looping over all utterances of the current speaker
-
- BaseFloat impr, spk_frame_count;
- // Compute the FMLLR transform and write it out.
- spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform,
- &spk_frame_count, &impr);
- fmllr_writer.Write(spk, fmllr_xform);
- tot_impr += impr;
- tot_t += spk_frame_count;
- } // end looping over speakers
- } else { // per-utterance adaptation
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- if (!gpost_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posts for utterance "
- << utt;
- num_no_gpost++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- if (fmllr_reader.IsOpen()) {
- if (fmllr_reader.HasKey(utt)) {
- fmllr_xform.CopyFromMat(fmllr_reader.Value(utt));
- logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet();
- } else {
- KALDI_WARN << "Cannot find FMLLR transform for " << utt;
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
- } else {
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
-
- const SgmmGauPost &gpost = gpost_reader.Value(utt);
-
- if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
- KALDI_WARN << "gpost has wrong size " << (gpost.size())
- << " vs. " << (feats.NumRows());
- num_other_error++;
- continue;
- }
- spk_stats.SetZero();
- AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars,
- logdet, &spk_stats);
- num_done++;
-
- BaseFloat impr, spk_frame_count;
- // Compute the FMLLR transform and write it out.
- spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform,
- &spk_frame_count, &impr);
- fmllr_writer.Write(utt, fmllr_xform);
- tot_impr += impr;
- tot_t += spk_frame_count;
- }
- }
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_gpost
- << " with no gposts, " << num_other_error << " with other errors.";
- KALDI_LOG << "Num frames " << tot_t << ", auxf impr per frame is "
- << (tot_impr / tot_t);
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
diff --git a/src/sgmmbin/sgmm-est-fmllr.cc b/src/sgmmbin/sgmm-est-fmllr.cc
+++ /dev/null
@@ -1,318 +0,0 @@
-// sgmmbin/sgmm-est-fmllr.cc
-
-// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-// 2014 Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/fmllr-sgmm.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
- const Matrix<BaseFloat> &transformed_feats, // if already fMLLR
- const std::vector<std::vector<int32> > &gselect,
- const SgmmGselectConfig &sgmm_config,
- const Posterior &post,
- const TransitionModel &trans_model,
- const AmSgmm &am_sgmm,
- const SgmmPerSpkDerivedVars &spk_vars,
- BaseFloat logdet,
- FmllrSgmmAccs *spk_stats) {
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- Posterior pdf_post;
- ConvertPosteriorToPdfs(trans_model, post, &pdf_post);
- for (size_t t = 0; t < post.size(); t++) {
- std::vector<int32> this_gselect;
- if (!gselect.empty()) {
- KALDI_ASSERT(t < gselect.size());
- this_gselect = gselect[t];
- } else {
- am_sgmm.GaussianSelection(sgmm_config, feats.Row(t), &this_gselect);
- }
- // per-frame vars only used for computing posteriors... use the
- // transformed feats for this, if available.
- am_sgmm.ComputePerFrameVars(transformed_feats.Row(t), this_gselect, spk_vars,
- 0.0 /*fMLLR logdet*/, &per_frame_vars);
-
-
- for (size_t j = 0; j < pdf_post[t].size(); j++) {
- int32 pdf_id = pdf_post[t][j].first;
- Matrix<BaseFloat> posteriors;
- am_sgmm.ComponentPosteriors(per_frame_vars, pdf_id,
- &posteriors);
- posteriors.Scale(pdf_post[t][j].second);
- spk_stats->AccumulateFromPosteriors(am_sgmm, spk_vars, feats.Row(t),
- this_gselect,
- posteriors, pdf_id);
- }
- }
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
- using namespace kaldi;
- const char *usage =
- "Estimate FMLLR transform for SGMMs, either per utterance or for the "
- "supplied set of speakers (with spk2utt option).\n"
- "Reads state-level posteriors. Writes to a table of matrices.\n"
- "Usage: sgmm-est-fmllr [options] <model-in> <feature-rspecifier> "
- "<post-rspecifier> <mats-wspecifier>\n";
-
- ParseOptions po(usage);
- string spk2utt_rspecifier, spkvecs_rspecifier, fmllr_rspecifier,
- gselect_rspecifier;
- BaseFloat min_count = 100;
- SgmmFmllrConfig fmllr_opts;
- SgmmGselectConfig sgmm_opts;
-
- po.Register("spk2utt", &spk2utt_rspecifier,
- "File to read speaker to utterance-list map from.");
- po.Register("spkvec-min-count", &min_count,
- "Minimum count needed to estimate speaker vectors");
- po.Register("spk-vecs", &spkvecs_rspecifier,
- "Speaker vectors to use during aligment (rspecifier)");
- po.Register("input-fmllr", &fmllr_rspecifier,
- "Initial FMLLR transform per speaker (rspecifier)");
- po.Register("gselect", &gselect_rspecifier,
- "Precomputed Gaussian indices (rspecifier)");
- fmllr_opts.Register(&po);
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- string model_rxfilename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- post_rspecifier = po.GetArg(3),
- fmllr_wspecifier = po.GetArg(4);
-
- TransitionModel trans_model;
- AmSgmm am_sgmm;
- SgmmFmllrGlobalParams fmllr_globals;
- {
- bool binary;
- Input ki(model_rxfilename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- fmllr_globals.Read(ki.Stream(), binary);
- }
-
- RandomAccessPosteriorReader post_reader(post_rspecifier);
- RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatMatrixReader fmllr_reader(fmllr_rspecifier);
-
- BaseFloatMatrixWriter fmllr_writer(fmllr_wspecifier);
-
- int32 dim = am_sgmm.FeatureDim();
- FmllrSgmmAccs spk_stats;
- spk_stats.Init(dim, am_sgmm.NumGauss());
- Matrix<BaseFloat> fmllr_xform(dim, dim + 1);
- BaseFloat logdet = 0.0;
- double tot_impr = 0.0, tot_t = 0.0;
- int32 num_done = 0, num_err = 0;
- std::vector<std::vector<int32> > empty_gselect;
-
- if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation
- SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
- for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
- spk_stats.SetZero();
- string spk = spk2utt_reader.Key();
- const vector<string> &uttlist = spk2utt_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(spk)) {
- spk_vars.v_s = spkvecs_reader.Value(spk);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << spk;
- num_err++;
- continue;
- }
- } // else spk_vars is "empty"
-
- if (fmllr_reader.IsOpen()) {
- if (fmllr_reader.HasKey(spk)) {
- fmllr_xform.CopyFromMat(fmllr_reader.Value(spk));
- logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet();
- } else {
- KALDI_WARN << "Cannot find FMLLR transform for " << spk;
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
- } else {
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
-
- for (size_t i = 0; i < uttlist.size(); i++) {
- std::string utt = uttlist[i];
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find features for utterance " << utt;
- num_err++;
- continue;
- }
- if (!post_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posteriors for utterance " << utt;
- num_err++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
- const Posterior &post = post_reader.Value(utt);
- if (static_cast<int32>(post.size()) != feats.NumRows()) {
- KALDI_WARN << "posterior vector has wrong size " << (post.size())
- << " vs. " << (feats.NumRows());
- num_err++;
- continue;
- }
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == feats.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- Matrix<BaseFloat> transformed_feats(feats);
- for (int32 r = 0; r < transformed_feats.NumRows(); r++) {
- SubVector<BaseFloat> row(transformed_feats, r);
- ApplyAffineTransform(fmllr_xform, &row);
- }
- AccumulateForUtterance(feats, transformed_feats, *gselect, sgmm_opts,
- post, trans_model, am_sgmm, spk_vars,
- logdet, &spk_stats);
- num_done++;
- } // end looping over all utterances of the current speaker
-
- BaseFloat impr, spk_frame_count;
- // Compute the FMLLR transform and write it out.
- spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform,
- &spk_frame_count, &impr);
- fmllr_writer.Write(spk, fmllr_xform);
- tot_impr += impr;
- tot_t += spk_frame_count;
- } // end looping over speakers
- } else { // per-utterance adaptation
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- if (!post_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posts for utterance "
- << utt;
- num_err++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_err++;
- continue;
- }
- } // else spk_vars is "empty"
-
- if (fmllr_reader.IsOpen()) {
- if (fmllr_reader.HasKey(utt)) {
- fmllr_xform.CopyFromMat(fmllr_reader.Value(utt));
- logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet();
- } else {
- KALDI_WARN << "Cannot find FMLLR transform for " << utt;
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
- } else {
- fmllr_xform.SetUnit();
- logdet = 0.0;
- }
-
- const Posterior &post = post_reader.Value(utt);
-
- if (static_cast<int32>(post.size()) != feats.NumRows()) {
- KALDI_WARN << "post has wrong size " << (post.size())
- << " vs. " << (feats.NumRows());
- num_err++;
- continue;
- }
- spk_stats.SetZero();
-
- Matrix<BaseFloat> transformed_feats(feats);
- for (int32 r = 0; r < transformed_feats.NumRows(); r++) {
- SubVector<BaseFloat> row(transformed_feats, r);
- ApplyAffineTransform(fmllr_xform, &row);
- }
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == feats.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- AccumulateForUtterance(feats, transformed_feats, *gselect, sgmm_opts,
- post, trans_model, am_sgmm, spk_vars,
- logdet, &spk_stats);
- num_done++;
-
- BaseFloat impr, spk_frame_count;
- // Compute the FMLLR transform and write it out.
- spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform,
- &spk_frame_count, &impr);
- fmllr_writer.Write(utt, fmllr_xform);
- tot_impr += impr;
- tot_t += spk_frame_count;
- }
- }
-
- KALDI_LOG << "Done " << num_done << " files, " << num_err << " with errors.";
- KALDI_LOG << "Overall auxf impr per frame is " << (tot_impr / tot_t)
- << " per frame, over " << tot_t << " frames.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
diff --git a/src/sgmmbin/sgmm-est-fmllrbasis.cc b/src/sgmmbin/sgmm-est-fmllrbasis.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-// sgmmbin/sgmm-est-fmllrbasis.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "matrix/matrix-lib.h"
-#include "hmm/transition-model.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/fmllr-sgmm.h"
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Sum multiple accumulated stats files for SGMM training.\n"
- "Usage: sgmm-est-fmllrbasis [options] <model-in> <model-out> "
- "<stats-in1> [stats-in2 ...]\n";
-
- bool binary = true;
- int32 num_bases = 50;
- kaldi::ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode.");
- po.Register("num-bases", &num_bases,
- "Number of fMLLR basis matrices to estimate.");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 3) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_in_filename = po.GetArg(1),
- model_out_filename = po.GetArg(2);
-
- kaldi::AmSgmm am_sgmm;
- kaldi::TransitionModel trans_model;
- kaldi::SgmmFmllrGlobalParams fmllr_globals;
- {
- bool binary_read;
- kaldi::Input ki(model_in_filename, &binary_read);
- trans_model.Read(ki.Stream(), binary_read);
- am_sgmm.Read(ki.Stream(), binary_read);
- fmllr_globals.Read(ki.Stream(), binary_read);
- }
-
- kaldi::SpMatrix<double> fmllr_grad_scatter;
- int32 dim = am_sgmm.FeatureDim();
- fmllr_grad_scatter.Resize(dim * (dim + 1), kaldi::kSetZero);
-
- for (int i = 3, max = po.NumArgs(); i <= max; i++) {
- std::string stats_in_filename = po.GetArg(i);
- bool binary_read;
- kaldi::Input ki(stats_in_filename, &binary_read);
- fmllr_grad_scatter.Read(ki.Stream(), binary_read,
- true /* add read values */);
- }
-
- kaldi::EstimateSgmmFmllrSubspace(fmllr_grad_scatter, num_bases, dim,
- &fmllr_globals);
-
- // Write out the accs
- {
- kaldi::Output ko(model_out_filename, binary);
- trans_model.Write(ko.Stream(), binary);
- am_sgmm.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll);
- fmllr_globals.Write(ko.Stream(), binary);
- }
-
- KALDI_LOG << "Written model to " << model_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-est-multi.cc b/src/sgmmbin/sgmm-est-multi.cc
+++ /dev/null
@@ -1,233 +0,0 @@
-// sgmmbin/sgmm-est-multi.cc
-
-// Copyright 2009-2012 Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "sgmm/estimate-am-sgmm-multi.h"
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- // Memory for these will be freed in the catch block in case of exceptions.
- std::vector<AmSgmm*> sgmms_in;
- std::vector<MleAmSgmmAccs*> sgmm_accs_in;
- std::vector<TransitionModel*> trans_models_in;
-
- try {
- typedef kaldi::int32 int32;
- const char *usage =
- "Estimate multiple SGMM models from corresponding stats, such that the"
- " global parameters\n(phone-, speaker-, and weight-projections and "
- "covariances) are tied across models.\n"
- "Usage: sgmm-est-multi [options] <model1> <stats1> <model1_out> <occs1_out> [<model2> "
- "<stats2> <model2_out> <occs2_out> ...]\n";
-
- bool binary_write = true;
- std::string update_flags_str = "vMNwcSt";
- std::string write_flags_str = "gsnu";
- kaldi::MleTransitionUpdateConfig tcfg;
- kaldi::MleAmSgmmOptions sgmm_opts;
- std::string split_substates = ""; // Space-seperated list of #substates
- std::vector<int32> split_substates_int; // The above string split on space
- int32 increase_phn_dim = 0;
- int32 increase_spk_dim = 0;
- bool remove_speaker_space = false;
- BaseFloat perturb_factor = 0.01;
- BaseFloat power = 0.2;
- BaseFloat max_cond = 100;
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
- // The split-substates option also takes a single integer: the same number
- // of substates for all models.
- po.Register("split-substates", &split_substates, "Space-separated string "
- "with target number of substates for each model.");
- po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space "
- "dimension as far as allowed towards this target.");
- po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space "
- "dimension as far as allowed towards this target.");
- po.Register("remove-speaker-space", &remove_speaker_space,
- "Remove speaker-specific projections N");
- po.Register("power", &power, "Exponent for substate occupancies used while "
- "splitting substates.");
- po.Register("perturb-factor", &perturb_factor, "Perturbation factor for "
- "state vectors while splitting substates.");
- po.Register("max-cond-split", &max_cond, "Max condition number of smoothing "
- "matrix used in substate splitting.");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to "
- "update: subset of vMNwcSt.");
- po.Register("write-flags", &write_flags_str, "Which SGMM parameters to "
- "write: subset of gsnu");
- tcfg.Register(&po);
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
- if (po.NumArgs() <= 0 || (po.NumArgs() % 4 != 0)) {
- po.PrintUsage();
- exit(1);
- }
- // How many 4-tuples of model, stats, output model, output occs
- int32 num_models = po.NumArgs()/4;
- sgmms_in.resize(num_models, NULL);
- sgmm_accs_in.resize(num_models, NULL);
- trans_models_in.resize(num_models, NULL);
-
- if (!split_substates.empty()) {
- SplitStringToIntegers(split_substates, " ", true /*omit empty strings*/,
- &split_substates_int);
- if (split_substates_int.size() == 1) { // Same #substates for all models
- int32 tmp_int = split_substates_int[0];
- split_substates_int.resize(num_models, tmp_int);
- }
- if (split_substates_int.size() != num_models) {
- KALDI_ERR << "Found " << split_substates_int.size() << " splitting "
- << "targets; expecting 1 or " << num_models;
- }
- }
-
- SgmmUpdateFlagsType update_flags = StringToSgmmUpdateFlags(update_flags_str);
- SgmmWriteFlagsType write_flags = StringToSgmmWriteFlags(write_flags_str);
-
- std::vector<std::string> model_out_filenames(num_models);
- std::vector<std::string> occs_out_filenames(num_models);
- int32 phn_dim, spk_dim, num_gauss, feat_dim;
-
- for (int i = 0; i < num_models; ++i) {
- std::string model_in_filename = po.GetArg(i*4+1),
- stats_filename = po.GetArg(i*4+2);
- model_out_filenames[i] = po.GetArg(i*4+3);
- occs_out_filenames[i] = po.GetArg(i*4+4);
-
- AmSgmm *am_sgmm = new AmSgmm();
- TransitionModel *trans_model = new TransitionModel();
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model->Read(ki.Stream(), binary);
- am_sgmm->Read(ki.Stream(), binary);
- }
- if (i == 0) {
- phn_dim = am_sgmm->PhoneSpaceDim();
- spk_dim = am_sgmm->SpkSpaceDim();
- num_gauss = am_sgmm->NumGauss();
- feat_dim = am_sgmm->FeatureDim();
- } else {
- if (am_sgmm->PhoneSpaceDim() != phn_dim) {
- KALDI_ERR << "File '" << model_in_filename << "': mismatched "
- << "phone-space dim: expecting " << phn_dim << ", found "
- << am_sgmm->PhoneSpaceDim();
- }
- if (am_sgmm->SpkSpaceDim() != spk_dim) {
- KALDI_ERR << "File '" << model_in_filename << "': mismatched "
- << "speaker-space dim: expecting " << spk_dim << ", found "
- << am_sgmm->SpkSpaceDim();
- }
- if (am_sgmm->NumGauss() != num_gauss) {
- KALDI_ERR << "File '" << model_in_filename << "': mismatched UBM "
- << "size: expecting " << num_gauss << ", found "
- << am_sgmm->NumGauss();
- }
- if (am_sgmm->FeatureDim() != feat_dim) {
- KALDI_ERR << "File '" << model_in_filename << "': mismatched feature "
- << "dim: expecting " << feat_dim << ", found "
- << am_sgmm->FeatureDim();
- }
- }
- sgmms_in[i] = am_sgmm;
- trans_models_in[i] = trans_model;
-
- Vector<double> transition_accs;
- MleAmSgmmAccs *sgmm_accs = new MleAmSgmmAccs();
- {
- bool binary;
- Input ki(stats_filename, &binary);
- transition_accs.Read(ki.Stream(), binary);
- sgmm_accs->Read(ki.Stream(), binary, false);
- }
- // Check consistency and print some diagnostics.
- sgmm_accs->Check(*am_sgmm, true);
- sgmm_accs_in[i] = sgmm_accs;
-
- if (update_flags & kSgmmTransitions) { // Update transition model.
- BaseFloat objf_impr, count;
- KALDI_LOG << "Updating transitions for model: " << model_in_filename;
- trans_model->MleUpdate(transition_accs, tcfg, &objf_impr, &count);
- KALDI_LOG << "Transition model update: average " << (objf_impr/count)
- << " log-like improvement per frame over " << (count)
- << " frames";
- }
- }
-
- { // Update all the SGMMs together.
- kaldi::MleAmSgmmUpdaterMulti multi_sgmm_updater(*sgmms_in[0], sgmm_opts);
- multi_sgmm_updater.Update(sgmm_accs_in, sgmms_in, update_flags);
- }
-
- for (int i = 0; i < num_models; ++i) {
- Vector<BaseFloat> state_occs;
- sgmm_accs_in[i]->GetStateOccupancies(&state_occs);
-
- if (!split_substates.empty()) {
- sgmms_in[i]->SplitSubstates(state_occs, split_substates_int[i], perturb_factor,
- power, max_cond);
- sgmms_in[i]->ComputeDerivedVars(); // recompute normalizers...
- }
-
- {
- kaldi::Output ko(occs_out_filenames[i], false /* no binary write */);
- state_occs.Write(ko.Stream(), false /* no binary write */);
- }
-
- if (increase_phn_dim != 0 || increase_spk_dim != 0) {
- // Feature normalizing transform matrix used to initialize the new columns
- // of the phonetic- or speaker-space projection matrices.
- kaldi::Matrix<BaseFloat> norm_xform;
- ComputeFeatureNormalizer(sgmms_in[i]->full_ubm(), &norm_xform);
- if (increase_phn_dim != 0)
- sgmms_in[i]->IncreasePhoneSpaceDim(increase_phn_dim, norm_xform);
- if (increase_spk_dim != 0)
- sgmms_in[i]->IncreaseSpkSpaceDim(increase_spk_dim, norm_xform);
- }
- if (remove_speaker_space) {
- KALDI_LOG << "Removing speaker space (projections N_)";
- sgmms_in[i]->RemoveSpeakerSpace();
- }
-
- {
- Output ko(model_out_filenames[i], binary_write);
- trans_models_in[i]->Write(ko.Stream(), binary_write);
- sgmms_in[i]->Write(ko.Stream(), binary_write, write_flags);
- KALDI_LOG << "Written model to " << model_out_filenames[i];
- }
- }
- return 0;
- } catch(const std::exception& e) {
- kaldi::DeletePointers(&sgmms_in);
- kaldi::DeletePointers(&sgmm_accs_in);
- kaldi::DeletePointers(&trans_models_in);
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-est-spkvecs-gpost.cc b/src/sgmmbin/sgmm-est-spkvecs-gpost.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-// sgmmbin/sgmm-est-spkvecs-gpost.cc
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/transition-model.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
- const SgmmGauPost &gpost,
- const TransitionModel &trans_model,
- const AmSgmm &am_sgmm,
- const SgmmPerSpkDerivedVars &spk_vars,
- MleSgmmSpeakerAccs *spk_stats) {
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- for (size_t i = 0; i < gpost.size(); i++) {
- am_sgmm.ComputePerFrameVars(feats.Row(i),
- gpost[i].gselect, spk_vars, 0.0,
- &per_frame_vars);
-
- for (size_t j = 0; j < gpost[i].tids.size(); j++) {
- int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i].tids[j]);
- spk_stats->AccumulateFromPosteriors(am_sgmm, per_frame_vars,
- gpost[i].posteriors[j], pdf_id);
- }
- }
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
- using namespace kaldi;
- const char *usage =
- "Estimate SGMM speaker vectors, either per utterance or for the "
- "supplied set of speakers (with spk2utt option).\n"
- "Reads Gaussian-level posteriors. Writes to a table of vectors.\n"
- "Usage: sgmm-est-spkvecs-gpost [options] <model-in> <feature-rspecifier> "
- "<gpost-rspecifier> <vecs-wspecifier>\n";
-
- ParseOptions po(usage);
- string spk2utt_rspecifier, spkvecs_rspecifier;
- BaseFloat min_count = 100;
- BaseFloat rand_prune = 1.0e-05;
-
- po.Register("spk2utt", &spk2utt_rspecifier,
- "File to read speaker to utterance-list map from.");
- po.Register("spkvec-min-count", &min_count,
- "Minimum count needed to estimate speaker vectors");
- po.Register("rand-prune", &rand_prune, "Randomized pruning parameter for posteriors (more->faster).");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors to use during aligment (rspecifier)");
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- string model_rxfilename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- gpost_rspecifier = po.GetArg(3),
- vecs_wspecifier = po.GetArg(4);
-
- TransitionModel trans_model;
- AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_rxfilename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
- MleSgmmSpeakerAccs spk_stats(am_sgmm, rand_prune);
-
- RandomAccessSgmmGauPostReader gpost_reader(gpost_rspecifier);
-
- RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-
- BaseFloatVectorWriter vecs_writer(vecs_wspecifier);
-
- double tot_impr = 0.0, tot_t = 0.0;
- int32 num_done = 0, num_no_gpost = 0, num_other_error = 0;
-
- if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation
- SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
- for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
- spk_stats.Clear();
- string spk = spk2utt_reader.Key();
- const vector<string> &uttlist = spk2utt_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(spk)) {
- spk_vars.v_s = spkvecs_reader.Value(spk);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << spk;
- }
- } // else spk_vars is "empty"
-
- for (size_t i = 0; i < uttlist.size(); i++) {
- std::string utt = uttlist[i];
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find features for utterance " << utt;
- continue;
- }
- if (!gpost_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posteriors for utterance " << utt;
- num_no_gpost++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
- const SgmmGauPost &gpost = gpost_reader.Value(utt);
- if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
- KALDI_WARN << "gpost vector has wrong size " << (gpost.size())
- << " vs. " << (feats.NumRows());
- num_other_error++;
- continue;
- }
-
- AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars, &spk_stats);
- num_done++;
- } // end looping over all utterances of the current speaker
-
- BaseFloat impr, spk_tot_t;
- { // Compute the spk_vec and write it out.
- Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
- if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s);
- spk_stats.Update(min_count, &spk_vec, &impr, &spk_tot_t);
- vecs_writer.Write(spk, spk_vec);
- }
- KALDI_LOG << "For speaker " << spk << ", auxf-impr from speaker vector is "
- << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.\n";
- tot_impr += impr;
- tot_t += spk_tot_t;
- } // end looping over speakers
- } else { // per-utterance adaptation
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- if (!gpost_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posts for utterance "
- << utt;
- num_no_gpost++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- }
- } // else spk_vars is "empty"
- const SgmmGauPost &gpost = gpost_reader.Value(utt);
-
- if (static_cast<int32>(gpost.size()) != feats.NumRows()) {
- KALDI_WARN << "gpost has wrong size " << (gpost.size())
- << " vs. " << (feats.NumRows());
- num_other_error++;
- continue;
- }
- num_done++;
-
- spk_stats.Clear();
-
- AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars, &spk_stats);
-
- BaseFloat impr, utt_tot_t;
- { // Compute the spk_vec and write it out.
- Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
- if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s);
- spk_stats.Update(min_count, &spk_vec, &impr, &utt_tot_t);
- vecs_writer.Write(utt, spk_vec);
- }
- KALDI_LOG << "For utterance " << utt << ", auxf-impr from speaker vectors is "
- << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
- tot_impr += impr;
- tot_t += utt_tot_t;
- }
- }
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_gpost
- << " with no gposts, " << num_other_error << " with other errors.";
- KALDI_LOG << "Overall auxf impr per frame is " << (tot_impr / tot_t)
- << " over " << tot_t << " frames.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
diff --git a/src/sgmmbin/sgmm-est-spkvecs.cc b/src/sgmmbin/sgmm-est-spkvecs.cc
+++ /dev/null
@@ -1,257 +0,0 @@
-// sgmmbin/sgmm-est-spkvecs.cc
-
-// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-// 2014 Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-#include <vector>
-using std::vector;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "hmm/posterior.h"
-
-namespace kaldi {
-
-void AccumulateForUtterance(const Matrix<BaseFloat> &feats,
- const Posterior &post,
- const TransitionModel &trans_model,
- const AmSgmm &am_sgmm,
- const SgmmGselectConfig &gselect_opts,
- const vector< vector<int32> > &gselect,
- const SgmmPerSpkDerivedVars &spk_vars,
- MleSgmmSpeakerAccs *spk_stats) {
- kaldi::SgmmPerFrameDerivedVars per_frame_vars;
-
- Posterior pdf_post;
- ConvertPosteriorToPdfs(trans_model, post, &pdf_post);
- for (size_t i = 0; i < post.size(); i++) {
- std::vector<int32> this_gselect;
- if (!gselect.empty())
- this_gselect = gselect[i];
- else
- am_sgmm.GaussianSelection(gselect_opts, feats.Row(i), &this_gselect);
- am_sgmm.ComputePerFrameVars(feats.Row(i), this_gselect, spk_vars, 0.0, &per_frame_vars);
-
- for (size_t j = 0; j < pdf_post[i].size(); j++) {
- int32 pdf_id = pdf_post[i][j].first;
- spk_stats->Accumulate(am_sgmm, per_frame_vars, pdf_id, pdf_post[i][j].second);
- }
- }
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
- using namespace kaldi;
- const char *usage =
- "Estimate SGMM speaker vectors, either per utterance or for the "
- "supplied set of speakers (with spk2utt option).\n"
- "Reads Gaussian-level posteriors. Writes to a table of vectors.\n"
- "Usage: sgmm-est-spkvecs [options] <model-in> <feature-rspecifier> "
- "<post-rspecifier> <vecs-wspecifier>\n";
-
- ParseOptions po(usage);
- string gselect_rspecifier, spk2utt_rspecifier, spkvecs_rspecifier;
- BaseFloat min_count = 100;
- BaseFloat rand_prune = 1.0e-05;
- SgmmGselectConfig gselect_opts;
-
- gselect_opts.Register(&po);
- po.Register("gselect", &gselect_rspecifier,
- "File to read precomputed per-frame Gaussian indices from.");
- po.Register("spk2utt", &spk2utt_rspecifier,
- "File to read speaker to utterance-list map from.");
- po.Register("spkvec-min-count", &min_count,
- "Minimum count needed to estimate speaker vectors");
- po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors to use during aligment (rspecifier)");
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- string model_rxfilename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- post_rspecifier = po.GetArg(3),
- vecs_wspecifier = po.GetArg(4);
-
- TransitionModel trans_model;
- AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_rxfilename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
- MleSgmmSpeakerAccs spk_stats(am_sgmm, rand_prune);
-
- RandomAccessPosteriorReader post_reader(post_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
- RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier);
-
- BaseFloatVectorWriter vecs_writer(vecs_wspecifier);
-
- double tot_impr = 0.0, tot_t = 0.0;
- int32 num_done = 0, num_no_post = 0, num_other_error = 0;
- std::vector<std::vector<int32> > empty_gselect;
-
- if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation
- SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
- for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
- spk_stats.Clear();
- string spk = spk2utt_reader.Key();
- const vector<string> &uttlist = spk2utt_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(spk)) {
- spk_vars.v_s = spkvecs_reader.Value(spk);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << spk;
- }
- } // else spk_vars is "empty"
-
- for (size_t i = 0; i < uttlist.size(); i++) {
- std::string utt = uttlist[i];
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find features for utterance " << utt;
- continue;
- }
- if (!post_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posteriors for utterance " << utt;
- num_no_post++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
- const Posterior &post = post_reader.Value(utt);
- if (static_cast<int32>(post.size()) != feats.NumRows()) {
- KALDI_WARN << "Posterior vector has wrong size " << (post.size())
- << " vs. " << (feats.NumRows());
- num_other_error++;
- continue;
- }
- bool has_gselect = false;
- if (gselect_reader.IsOpen()) {
- has_gselect = gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == feats.NumRows();
- if (!has_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- }
- const std::vector<std::vector<int32> > *gselect =
- (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- AccumulateForUtterance(feats, post, trans_model, am_sgmm, gselect_opts, *gselect, spk_vars, &spk_stats);
- num_done++;
- } // end looping over all utterances of the current speaker
-
- BaseFloat impr, spk_tot_t;
- { // Compute the spk_vec and write it out.
- Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
- if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s);
- spk_stats.Update(min_count, &spk_vec, &impr, &spk_tot_t);
- vecs_writer.Write(spk, spk_vec);
- }
- KALDI_LOG << "For speaker " << spk << ", auxf-impr from speaker vector is "
- << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.";
- tot_impr += impr;
- tot_t += spk_tot_t;
- } // end looping over speakers
- } else { // per-utterance adaptation
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- if (!post_reader.HasKey(utt)) {
- KALDI_WARN << "Did not find posts for utterance "
- << utt;
- num_no_post++;
- continue;
- }
- const Matrix<BaseFloat> &feats = feature_reader.Value();
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- }
- } // else spk_vars is "empty"
- const Posterior &post = post_reader.Value(utt);
-
- if (static_cast<int32>(post.size()) != feats.NumRows()) {
- KALDI_WARN << "Posterior has wrong size " << (post.size())
- << " vs. " << (feats.NumRows());
- num_other_error++;
- continue;
- }
- num_done++;
-
- spk_stats.Clear();
- bool has_gselect = false;
- if (gselect_reader.IsOpen()) {
- has_gselect = gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == feats.NumRows();
- if (!has_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- }
- const std::vector<std::vector<int32> > *gselect =
- (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- AccumulateForUtterance(feats, post, trans_model, am_sgmm, gselect_opts, *gselect, spk_vars, &spk_stats);
-
- BaseFloat impr, utt_tot_t;
- { // Compute the spk_vec and write it out.
- Vector<BaseFloat> spk_vec(am_sgmm.SpkSpaceDim(), kSetZero);
- if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s);
- spk_stats.Update(min_count, &spk_vec, &impr, &utt_tot_t);
- vecs_writer.Write(utt, spk_vec);
- }
- KALDI_LOG << "For utterance " << utt << ", auxf-impr from speaker vectors is "
- << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames.";
- tot_impr += impr;
- tot_t += utt_tot_t;
- }
- }
-
- KALDI_LOG << "Overall auxf impr per frame is "
- << (tot_impr / tot_t) << " over " << tot_t << " frames.";
- KALDI_LOG << "Done " << num_done << " files, " << num_no_post
- << " with no posts, " << num_other_error << " with other errors.";
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
diff --git a/src/sgmmbin/sgmm-est.cc b/src/sgmmbin/sgmm-est.cc
--- a/src/sgmmbin/sgmm-est.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-// sgmmbin/sgmm-est.cc
-
-// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "thread/kaldi-thread.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- const char *usage =
- "Estimate SGMM model parameters from accumulated stats.\n"
- "Usage: sgmm-est [options] <model-in> <stats-in> <model-out>\n";
-
- bool binary_write = true;
- std::string update_flags_str = "vMNwcSt";
- std::string write_flags_str = "gsnu";
- kaldi::MleTransitionUpdateConfig tcfg;
- kaldi::MleAmSgmmOptions sgmm_opts;
- int32 split_substates = 0;
- int32 increase_phn_dim = 0;
- int32 increase_spk_dim = 0;
- bool remove_speaker_space = false;
- BaseFloat perturb_factor = 0.01;
- BaseFloat power = 0.2;
- BaseFloat max_cond = 100;
- std::string occs_out_filename;
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
- po.Register("split-substates", &split_substates, "Increase number of "
- "substates to this overall target.");
- po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space "
- "dimension as far as allowed towards this target.");
- po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space "
- "dimension as far as allowed towards this target.");
- po.Register("remove-speaker-space", &remove_speaker_space, "Remove speaker-specific "
- "projections N");
- po.Register("power", &power, "Exponent for substate occupancies used while "
- "splitting substates.");
- po.Register("perturb-factor", &perturb_factor, "Perturbation factor for "
- "state vectors while splitting substates.");
- po.Register("max-cond-split", &max_cond, "Max condition number of smoothing "
- "matrix used in substate splitting.");
- po.Register("write-occs", &occs_out_filename, "File to write pdf "
- "occupantion counts to.");
- po.Register("update-flags", &update_flags_str, "Which SGMM parameters to "
- "update: subset of vMNwcSt.");
- po.Register("write-flags", &write_flags_str, "Which SGMM parameters to "
- "write: subset of gsnu");
- po.Register("num-threads", &g_num_threads, "Number of threads to use in "
- "weight update and normalizer computation");
- tcfg.Register(&po);
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
- if (po.NumArgs() != 3) {
- po.PrintUsage();
- exit(1);
- }
- std::string model_in_filename = po.GetArg(1),
- stats_filename = po.GetArg(2),
- model_out_filename = po.GetArg(3);
-
- kaldi::SgmmUpdateFlagsType update_flags =
- StringToSgmmUpdateFlags(update_flags_str);
- kaldi::SgmmWriteFlagsType write_flags =
- StringToSgmmWriteFlags(write_flags_str);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- Vector<double> transition_accs;
- MleAmSgmmAccs sgmm_accs;
- {
- bool binary;
- Input ki(stats_filename, &binary);
- transition_accs.Read(ki.Stream(), binary);
- sgmm_accs.Read(ki.Stream(), binary, true); // true == add; doesn't matter here.
- }
-
- if (update_flags & kSgmmTransitions) { // Update transition model.
- BaseFloat objf_impr, count;
- trans_model.MleUpdate(transition_accs, tcfg, &objf_impr, &count);
- KALDI_LOG << "Transition model update: Overall " << (objf_impr/count)
- << " log-like improvement per frame over " << (count)
- << " frames.";
- }
-
- sgmm_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics.
-
- { // Do the update.
- kaldi::MleAmSgmmUpdater updater(sgmm_opts);
- updater.Update(sgmm_accs, &am_sgmm, update_flags);
- }
-
- if (split_substates != 0 || !occs_out_filename.empty()) { // get state occs
- Vector<BaseFloat> pdf_occs;
- sgmm_accs.GetStateOccupancies(&pdf_occs);
-
- if (split_substates != 0) {
- am_sgmm.SplitSubstates(pdf_occs, split_substates, perturb_factor,
- power, max_cond);
- am_sgmm.ComputeDerivedVars(); // recompute normalizers...
- }
-
- if (!occs_out_filename.empty()) {
- bool binary_write = false;
- kaldi::Output ko(occs_out_filename, binary_write);
- pdf_occs.Write(ko.Stream(), binary_write);
- }
- }
-
- if (increase_phn_dim != 0 || increase_spk_dim != 0) {
- // Feature normalizing transform matrix used to initialize the new columns
- // of the phonetic- or speaker-space projection matrices.
- kaldi::Matrix<BaseFloat> norm_xform;
- ComputeFeatureNormalizer(am_sgmm.full_ubm(), &norm_xform);
- if (increase_phn_dim != 0)
- am_sgmm.IncreasePhoneSpaceDim(increase_phn_dim, norm_xform);
- if (increase_spk_dim != 0)
- am_sgmm.IncreaseSpkSpaceDim(increase_spk_dim, norm_xform);
- }
- if (remove_speaker_space) {
- KALDI_LOG << "Removing speaker space (projections N_)";
- am_sgmm.RemoveSpeakerSpace();
- }
-
- {
- Output ko(model_out_filename, binary_write);
- trans_model.Write(ko.Stream(), binary_write);
- am_sgmm.Write(ko.Stream(), binary_write, write_flags);
- }
-
-
- KALDI_LOG << "Written model to " << model_out_filename;
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-gselect.cc b/src/sgmmbin/sgmm-gselect.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-// sgmmbin/sgmm-gselect.cc
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- const char *usage =
- "Precompute Gaussian indices for SGMM training "
- "Usage: sgmm-gselect [options] <model-in> <feature-rspecifier> <gselect-wspecifier>\n"
- "e.g.: sgmm-gselect 1.sgmm \"ark:feature-command |\" ark:1.gs\n"
- "Note: you can do the same thing by combining the programs sgmm-write-ubm, fgmm-global-to-gmm,\n"
- "gmm-gselect and fgmm-gselect\n";
-
- ParseOptions po(usage);
- kaldi::SgmmGselectConfig sgmm_opts;
- std::string preselect_rspecifier;
- std::string likelihood_wspecifier;
- po.Register("preselect", &preselect_rspecifier, "Rspecifier for sets of Gaussians to "
- "limit gselect to (e.g. for gender dependent systems)");
- po.Register("write-likes", &likelihood_wspecifier, "Wspecifier for likelihoods per "
- "utterance");
- sgmm_opts.Register(&po);
- po.Read(argc, argv);
-
- if (po.NumArgs() != 3) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- gselect_wspecifier = po.GetArg(3);
-
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_filename, &binary);
- TransitionModel trans_model;
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- double tot_like = 0.0;
- kaldi::int64 tot_t = 0;
-
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- Int32VectorVectorWriter gselect_writer(gselect_wspecifier);
- BaseFloatWriter likelihood_writer(likelihood_wspecifier);
- RandomAccessInt32VectorReader preselect_reader(preselect_rspecifier);
-
- int32 num_done = 0, num_err = 0;
- for (; !feature_reader.Done(); feature_reader.Next()) {
- int32 tot_t_this_file = 0; double tot_like_this_file = 0;
- std::string utt = feature_reader.Key();
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- std::vector<std::vector<int32> > gselect_vec(mat.NumRows());
- tot_t_this_file += mat.NumRows();
- if(preselect_rspecifier != "") { // e.g. gender dependent.
- if (!preselect_reader.HasKey(utt)) {
- KALDI_WARN << "No preselect information for utterance " << utt;
- num_err++;
- continue;
- }
- const std::vector<int32> &preselect = preselect_reader.Value(utt);
- KALDI_ASSERT(!preselect.empty());
- for (int32 i = 0; i < mat.NumRows(); i++)
- tot_like_this_file +=
- am_sgmm.GaussianSelectionPreselect(sgmm_opts, mat.Row(i),
- preselect, &(gselect_vec[i]));
- } else {
- for (int32 i = 0; i < mat.NumRows(); i++)
- tot_like_this_file += am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &(gselect_vec[i]));
- }
- gselect_writer.Write(utt, gselect_vec);
- if (num_done % 10 == 0)
- KALDI_LOG << "For " << num_done << "'th file, average UBM likelihood over "
- << tot_t_this_file << " frames is "
- << (tot_like_this_file/tot_t_this_file);
- tot_t += tot_t_this_file;
- tot_like += tot_like_this_file;
-
- if(likelihood_wspecifier != "")
- likelihood_writer.Write(utt, tot_like_this_file);
- num_done++;
- }
-
- KALDI_LOG << "Done " << num_done << " files, " << num_err
- << " with errors, average UBM log-likelihood is "
- << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
-
- if (num_done != 0) return 0;
- else return 1;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-info.cc b/src/sgmmbin/sgmm-info.cc
--- a/src/sgmmbin/sgmm-info.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// sgmmbin/sgmm-info.cc
-
-// Copyright 2012 Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <iomanip>
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- const char *usage =
- "Print various information about an SGMM.\n"
- "Usage: sgmm-info [options] <model-in> [model-in2 ... ]\n";
-
- bool sgmm_detailed = false;
- bool trans_detailed = false;
-
- ParseOptions po(usage);
- po.Register("sgmm-detailed", &sgmm_detailed,
- "Print detailed information about substates.");
- po.Register("trans-detailed", &trans_detailed,
- "Print detailed information about transition model.");
-
- po.Read(argc, argv);
- if (po.NumArgs() < 1) {
- po.PrintUsage();
- exit(1);
- }
-
- for (int i = 1, max = po.NumArgs(); i <= max; ++i) {
- std::string model_in_filename = po.GetArg(i);
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- {
- using namespace std;
- cout.setf(ios::left);
- cout << "\nModel file: " << model_in_filename << endl;
- cout << " SGMM information:\n"
- << setw(40) << " # of HMM states" << am_sgmm.NumPdfs() << endl
- << setw(40) << " # of Gaussians per state" << am_sgmm.NumGauss() << endl
- << setw(40) << " Dimension of phone vector space"
- << am_sgmm.PhoneSpaceDim() << endl
- << setw(40) << " Dimension of speaker vector space"
- << am_sgmm.SpkSpaceDim() << endl
- << setw(40) << " Dimension of feature vectors"
- << am_sgmm.FeatureDim() << endl;
- int32 total_substates = 0;
- for (int32 j = 0; j < am_sgmm.NumPdfs(); j++) {
- total_substates += am_sgmm.NumSubstates(j);
- if (sgmm_detailed) {
- cout << " # of substates for state " << setw(13) << j
- << am_sgmm.NumSubstates(j) << endl;
- }
- }
- cout << setw(40) << " Total # of substates " << total_substates << endl;
-
- cout << "\nTransition model information:\n"
- << setw(40) << " # of HMM states" << trans_model.NumPdfs() << endl
- << setw(40) << " # of transition states"
- << trans_model.NumTransitionStates() << endl;
- int32 total_indices = 0;
- for (int32 s = 0; s < trans_model.NumTransitionStates(); s++) {
- total_indices += trans_model.NumTransitionIndices(s);
- if (trans_detailed) {
- cout << " # of transition ids for state " << setw(8) << s
- << trans_model.NumTransitionIndices(s) << endl;
- }
- }
- cout << setw(40) << " Total # of transition ids " << total_indices
- << endl;
- }
- }
-
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-init-from-tree-stats.cc b/src/sgmmbin/sgmm-init-from-tree-stats.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// sgmmbin/sgmm-init-from-tree-stats.cc
-
-// Copyright 2012 Arnab Ghoshal Johns Hopkins University (Author: Daniel Povey)
-// Copyright 2009-2011 Saarland University
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "sgmm/am-sgmm.h"
-#include "sgmm/sgmm-clusterable.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-#include "tree/build-tree-utils.h"
-
-
-
-namespace kaldi {
-void InitAndOutputSgmm(const HmmTopology &topo,
- const AmSgmm &am_sgmm,
- const ContextDependency &ctx_dep,
- const std::vector<SpMatrix<double> > &H,
- const BuildTreeStatsType &stats,
- const std::string &sgmm_wxfilename,
- bool binary) {
- int32 num_pdfs = ctx_dep.NumPdfs();
- AmSgmm am_sgmm_out;
- am_sgmm_out.CopyGlobalsInitVecs(am_sgmm, am_sgmm.PhoneSpaceDim(),
- am_sgmm.SpkSpaceDim(), num_pdfs);
- MleAmSgmmOptions opts; // Use default options; we can change this later
- // if we need to use any non-default options.
- MleAmSgmmUpdater updater(opts);
-
- std::vector<BuildTreeStatsType> split_stats;
- SplitStatsByMap(stats, ctx_dep.ToPdfMap(), &split_stats);
- // Make sure each leaf has stats.
- for (size_t i = 0; i < split_stats.size(); i++)
- KALDI_ASSERT(! split_stats[i].empty() && "Tree has leaves with no stats."
- " Modify your roots file as necessary to fix this.");
- std::vector<Clusterable*> summed_stats;
- SumStatsVec(split_stats, &summed_stats);
-
- std::vector<SgmmClusterable*> &summed_sgmm_stats =
- *(reinterpret_cast<std::vector<SgmmClusterable*>*> (&summed_stats));
-
- for (int32 iter = 0; iter < 5; iter++) { // Update for
- // several iterations; we're starting from zero so we won't
- // converge exactly on the first iteration.
- updater.UpdatePhoneVectorsCheckedFromClusterable(summed_sgmm_stats,
- H,
- &am_sgmm_out);
- }
- DeletePointers(&summed_stats);
-
- TransitionModel trans_model_out(ctx_dep, topo);
- {
- Output ko(sgmm_wxfilename, binary);
- am_sgmm_out.ComputeNormalizers();
- trans_model_out.Write(ko.Stream(), binary);
- am_sgmm_out.Write(ko.Stream(), binary, kSgmmWriteAll);
- }
-}
-
-}
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Initialize an SGMM from a previously built SGMM, a tree, \n"
- "and SGMM-type tree stats\n"
- "Usage: sgmm-init-from-tree-stats [options] <old-sgmm> <tree> <sgmm-tree-stats> <sgmm-out>\n";
-
- bool binary = true;
- kaldi::ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode");
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string sgmm_in_filename = po.GetArg(1),
- tree_in_filename = po.GetArg(2),
- tree_stats_filename = po.GetArg(3),
- sgmm_out_filename = po.GetArg(4);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(sgmm_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- const HmmTopology &topo = trans_model.GetTopo();
- std::vector<SpMatrix<double> > H;
- am_sgmm.ComputeH(&H);
-
- ContextDependency ctx_dep;
- {
- bool binary_in;
- Input ki(tree_in_filename.c_str(), &binary_in);
- ctx_dep.Read(ki.Stream(), binary_in);
- }
-
- BuildTreeStatsType stats;
- {
- bool binary_in;
- SgmmClusterable sc(am_sgmm, H); // dummy stats needed to provide
- // type info, and access to am_sgmm and H.
- Input ki(tree_stats_filename, &binary_in);
- ReadBuildTreeStats(ki.Stream(), binary_in, sc, &stats);
- }
- KALDI_LOG << "Number of separate statistics is " << stats.size();
-
- InitAndOutputSgmm(topo, am_sgmm, ctx_dep, H, stats,
- sgmm_out_filename, binary);
-
- KALDI_LOG << "Written model to " << sgmm_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-init.cc b/src/sgmmbin/sgmm-init.cc
--- a/src/sgmmbin/sgmm-init.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-// sgmmbin/sgmm-init.cc
-
-// Copyright 2012 Arnab Ghoshal
-// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "gmm/am-diag-gmm.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "tree/context-dep.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Initialize an SGMM from a trained full-covariance UBM and a specified"
- " model topology.\n"
- "Usage: sgmm-init [options] <topology> <tree> <init-model> <sgmm-out>\n"
- "The <init-model> argument can be a UBM (the default case) or another\n"
- "SGMM (if the --init-from-sgmm flag is used).\n";
-
- bool binary = true, init_from_sgmm = false;
- int32 phn_space_dim = 0, spk_space_dim = 0;
- kaldi::ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode");
- po.Register("phn-space-dim", &phn_space_dim, "Phonetic space dimension.");
- po.Register("spk-space-dim", &spk_space_dim, "Speaker space dimension.");
- po.Register("init-from-sgmm", &init_from_sgmm,
- "Initialize from another SGMM (instead of a UBM).");
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string topo_in_filename = po.GetArg(1),
- tree_in_filename = po.GetArg(2),
- init_model_filename = po.GetArg(3),
- sgmm_out_filename = po.GetArg(4);
-
- ContextDependency ctx_dep;
- {
- bool binary_in;
- Input ki(tree_in_filename.c_str(), &binary_in);
- ctx_dep.Read(ki.Stream(), binary_in);
- }
-
-
- HmmTopology topo;
- ReadKaldiObject(topo_in_filename, &topo);
-
- TransitionModel trans_model(ctx_dep, topo);
-
- kaldi::AmSgmm sgmm;
- if (init_from_sgmm) {
- kaldi::AmSgmm init_sgmm;
- {
- bool binary_read;
- TransitionModel tmp_trans;
- kaldi::Input ki(init_model_filename, &binary_read);
- tmp_trans.Read(ki.Stream(), binary_read);
- init_sgmm.Read(ki.Stream(), binary_read);
- }
- sgmm.CopyGlobalsInitVecs(init_sgmm, phn_space_dim, spk_space_dim,
- trans_model.NumPdfs());
- } else {
- kaldi::FullGmm ubm;
- {
- bool binary_read;
- kaldi::Input ki(init_model_filename, &binary_read);
- ubm.Read(ki.Stream(), binary_read);
- }
- sgmm.InitializeFromFullGmm(ubm, trans_model.NumPdfs(), phn_space_dim,
- spk_space_dim);
- }
- sgmm.ComputeNormalizers();
-
- {
- kaldi::Output ko(sgmm_out_filename, binary);
- trans_model.Write(ko.Stream(), binary);
- sgmm.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll);
- }
-
- KALDI_LOG << "Written model to " << sgmm_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-latgen-faster.cc b/src/sgmmbin/sgmm-latgen-faster.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-// sgmmbin/sgmm-latgen-faster.cc
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation;
-// Johns Hopkins University (author: Daniel Povey)
-// 2014 Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "sgmm/decodable-am-sgmm.h"
-#include "base/timer.h"
-
-namespace kaldi {
-
-// the reference arguments at the beginning are not const as the style guide
-// requires, but are best viewed as inputs.
-bool ProcessUtterance(LatticeFasterDecoder &decoder,
- const AmSgmm &am_sgmm,
- const TransitionModel &trans_model,
- const SgmmGselectConfig &sgmm_opts,
- double log_prune,
- double acoustic_scale,
- const Matrix<BaseFloat> &features,
- RandomAccessInt32VectorVectorReader &gselect_reader,
- RandomAccessBaseFloatVectorReaderMapped &spkvecs_reader,
- const fst::SymbolTable *word_syms,
- const std::string &utt,
- bool determinize,
- bool allow_partial,
- Int32VectorWriter *alignments_writer,
- Int32VectorWriter *words_writer,
- CompactLatticeWriter *compact_lattice_writer,
- LatticeWriter *lattice_writer,
- double *like_ptr) { // puts utterance's like in like_ptr on success.
- using fst::VectorFst;
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt << ", not decoding this utterance";
- return false; // We could use zero, but probably the user would want to know about this
- // (this would normally be a script error or some kind of failure).
- }
- }
- bool has_gselect = false;
- if (gselect_reader.IsOpen()) {
- has_gselect = gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == features.NumRows();
- if (!has_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- }
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
- DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars,
- trans_model, features, *gselect,
- log_prune, acoustic_scale);
-
- return DecodeUtteranceLatticeFaster(
- decoder, sgmm_decodable, trans_model, word_syms, utt, acoustic_scale,
- determinize, allow_partial, alignments_writer, words_writer,
- compact_lattice_writer, lattice_writer, like_ptr);
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- using fst::SymbolTable;
- using fst::VectorFst;
- using fst::StdArc;
-
- const char *usage =
- "Decode features using SGMM-based model.\n"
- "Usage: sgmm-latgen-faster [options] <model-in> (<fst-in>|<fsts-rspecifier>) "
- "<features-rspecifier> <lattices-wspecifier> [<words-wspecifier> [<alignments-wspecifier>] ]\n";
- ParseOptions po(usage);
- BaseFloat acoustic_scale = 0.1;
- bool allow_partial = false;
- BaseFloat log_prune = 5.0;
- string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier,
- utt2spk_rspecifier;
-
- LatticeFasterDecoderConfig decoder_opts;
- SgmmGselectConfig sgmm_opts;
- decoder_opts.Register(&po);
- sgmm_opts.Register(&po);
-
- po.Register("acoustic-scale", &acoustic_scale,
- "Scaling factor for acoustic likelihoods");
- po.Register("log-prune", &log_prune,
- "Pruning beam used to reduce number of exp() evaluations.");
- po.Register("word-symbol-table", &word_syms_filename,
- "Symbol table for words [for debug output]");
- po.Register("allow-partial", &allow_partial,
- "Produce output even when final state was not reached");
- po.Register("gselect", &gselect_rspecifier,
- "rspecifier for precomputed per-frame Gaussian indices.");
- po.Register("spk-vecs", &spkvecs_rspecifier,
- "rspecifier for speaker vectors");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 4 || po.NumArgs() > 6) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_in_filename = po.GetArg(1),
- fst_in_str = po.GetArg(2),
- feature_rspecifier = po.GetArg(3),
- lattice_wspecifier = po.GetArg(4),
- words_wspecifier = po.GetOptArg(5),
- alignment_wspecifier = po.GetOptArg(6);
-
- TransitionModel trans_model;
- kaldi::AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- CompactLatticeWriter compact_lattice_writer;
- LatticeWriter lattice_writer;
- bool determinize = decoder_opts.determinize_lattice;
- if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
- : lattice_writer.Open(lattice_wspecifier)))
- KALDI_ERR << "Could not open table for writing lattices: "
- << lattice_wspecifier;
-
- Int32VectorWriter words_writer(words_wspecifier);
-
- Int32VectorWriter alignment_writer(alignment_wspecifier);
-
- fst::SymbolTable *word_syms = NULL;
- if (word_syms_filename != "")
- if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
- KALDI_ERR << "Could not read symbol table from file "
- << word_syms_filename;
-
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
-
- BaseFloat tot_like = 0.0;
- kaldi::int64 frame_count = 0;
- int num_success = 0, num_fail = 0;
-
- Timer timer;
-
- if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { // a single FST.
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- // It's important that we initialize decode_fst after feature_reader, as it
- // can prevent crashes on systems installed without enough virtual memory.
- // It has to do with what happens on UNIX systems if you call fork() on a
- // large process: the page-table entries are duplicated, which requires a
- // lot of virtual memory.
- VectorFst<StdArc> *decode_fst = fst::ReadFstKaldi(fst_in_str);
- timer.Reset(); // exclude graph loading time.
-
- {
- LatticeFasterDecoder decoder(*decode_fst, decoder_opts);
-
- const std::vector<std::vector<int32> > empty_gselect;
-
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- const Matrix<BaseFloat> &features(feature_reader.Value());
- if (features.NumRows() == 0) {
- KALDI_WARN << "Zero-length utterance: " << utt;
- num_fail++;
- continue;
- }
- double like;
- if (ProcessUtterance(decoder, am_sgmm, trans_model, sgmm_opts, log_prune, acoustic_scale,
- features, gselect_reader, spkvecs_reader, word_syms,
- utt, determinize, allow_partial,
- &alignment_writer, &words_writer, &compact_lattice_writer,
- &lattice_writer, &like)) {
- tot_like += like;
- frame_count += features.NumRows();
- KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
- << (like / features.NumRows()) << " over "
- << features.NumRows() << " frames.";
- num_success++;
- } else { num_fail++; }
- }
- }
- delete decode_fst; // only safe to do this after decoder goes out of scope.
- } else { // We have different FSTs for different utterances.
- SequentialTableReader<fst::VectorFstHolder> fst_reader(fst_in_str);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
- for (; !fst_reader.Done(); fst_reader.Next()) {
- std::string utt = fst_reader.Key();
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "Not decoding utterance " << utt
- << " because no features available.";
- num_fail++;
- continue;
- }
- const Matrix<BaseFloat> &features = feature_reader.Value(utt);
- if (features.NumRows() == 0) {
- KALDI_WARN << "Zero-length utterance: " << utt;
- num_fail++;
- continue;
- }
- LatticeFasterDecoder decoder(fst_reader.Value(), decoder_opts);
- double like;
- if (ProcessUtterance(decoder, am_sgmm, trans_model, sgmm_opts, log_prune, acoustic_scale,
- features, gselect_reader, spkvecs_reader, word_syms,
- utt, determinize, allow_partial,
- &alignment_writer, &words_writer, &compact_lattice_writer,
- &lattice_writer, &like)) {
- tot_like += like;
- frame_count += features.NumRows();
- KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
- << (like / features.NumRows()) << " over "
- << features.NumRows() << " frames.";
- num_success++;
- } else { num_fail++; }
- }
- }
- double elapsed = timer.Elapsed();
- KALDI_LOG << "Time taken [excluding initialization] "<< elapsed
- << "s: real-time factor assuming 100 frames/sec is "
- << (elapsed*100.0/frame_count);
- KALDI_LOG << "Done " << num_success << " utterances, failed for "
- << num_fail;
- KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count)
- << " over " << frame_count << " frames.";
-
- delete word_syms;
- return (num_success != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-latgen-simple.cc b/src/sgmmbin/sgmm-latgen-simple.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-// sgmmbin/sgmm-latgen-simple.cc
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation
-// 2013 Johns Hopkins University (author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string>
-using std::string;
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "decoder/decoder-wrappers.h"
-#include "sgmm/decodable-am-sgmm.h"
-#include "base/timer.h"
-
-namespace kaldi {
-
-// the reference arguments at the beginning are not const as the style guide
-// requires, but are best viewed as inputs.
-bool ProcessUtterance(LatticeSimpleDecoder &decoder,
- const AmSgmm &am_sgmm,
- const TransitionModel &trans_model,
- const SgmmGselectConfig &sgmm_opts,
- double log_prune,
- double acoustic_scale,
- const Matrix<BaseFloat> &features,
- RandomAccessInt32VectorVectorReader &gselect_reader,
- RandomAccessBaseFloatVectorReaderMapped &spkvecs_reader,
- const fst::SymbolTable *word_syms,
- const std::string &utt,
- bool determinize,
- bool allow_partial,
- Int32VectorWriter *alignments_writer,
- Int32VectorWriter *words_writer,
- CompactLatticeWriter *compact_lattice_writer,
- LatticeWriter *lattice_writer,
- double *like_ptr) { // puts utterance's like in like_ptr on success.
- using fst::VectorFst;
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt << ", not decoding this utterance";
- return false; // We could use zero, but probably the user would want to know about this
- // (this would normally be a script error or some kind of failure).
- }
- }
- bool has_gselect = false;
- if (gselect_reader.IsOpen()) {
- has_gselect = gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == features.NumRows();
- if (!has_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- }
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
- DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars,
- trans_model, features, *gselect,
- log_prune, acoustic_scale);
-
- return DecodeUtteranceLatticeSimple(
- decoder, sgmm_decodable, trans_model, word_syms, utt, acoustic_scale,
- determinize, allow_partial, alignments_writer, words_writer,
- compact_lattice_writer, lattice_writer, like_ptr);
-}
-
-} // end namespace kaldi
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- using fst::SymbolTable;
- using fst::VectorFst;
- using fst::StdArc;
-
- const char *usage =
- "Decode features using SGMM-based model.\n"
- "Usage: sgmm-latgen-simple [options] <model-in> <fst-in> "
- "<features-rspecifier> <lattices-wspecifier> [<words-wspecifier> [<alignments-wspecifier>] ]\n";
- ParseOptions po(usage);
- BaseFloat acoustic_scale = 0.1;
- bool allow_partial = false;
- BaseFloat log_prune = 5.0;
- string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier,
- utt2spk_rspecifier;
-
- LatticeSimpleDecoderConfig decoder_opts;
- SgmmGselectConfig sgmm_opts;
- decoder_opts.Register(&po);
- sgmm_opts.Register(&po);
-
- po.Register("acoustic-scale", &acoustic_scale,
- "Scaling factor for acoustic likelihoods");
- po.Register("log-prune", &log_prune,
- "Pruning beam used to reduce number of exp() evaluations.");
- po.Register("word-symbol-table", &word_syms_filename,
- "Symbol table for words [for debug output]");
- po.Register("allow-partial", &allow_partial,
- "Produce output even when final state was not reached");
- po.Register("gselect", &gselect_rspecifier,
- "rspecifier for precomputed per-frame Gaussian indices.");
- po.Register("spk-vecs", &spkvecs_rspecifier,
- "rspecifier for speaker vectors");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 4 || po.NumArgs() > 6) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_in_filename = po.GetArg(1),
- fst_in_filename = po.GetArg(2),
- feature_rspecifier = po.GetArg(3),
- lattice_wspecifier = po.GetArg(4),
- words_wspecifier = po.GetOptArg(5),
- alignment_wspecifier = po.GetOptArg(6);
-
- TransitionModel trans_model;
- kaldi::AmSgmm am_sgmm;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- CompactLatticeWriter compact_lattice_writer;
- LatticeWriter lattice_writer;
- bool determinize = decoder_opts.determinize_lattice;
- if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier)
- : lattice_writer.Open(lattice_wspecifier)))
- KALDI_ERR << "Could not open table for writing lattices: "
- << lattice_wspecifier;
-
- Int32VectorWriter words_writer(words_wspecifier);
-
- Int32VectorWriter alignment_writer(alignment_wspecifier);
-
- fst::SymbolTable *word_syms = NULL;
- if (word_syms_filename != "")
- if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
- KALDI_ERR << "Could not read symbol table from file "
- << word_syms_filename;
-
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
-
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
-
- // It's important that we initialize decode_fst after feature_reader, as it
- // can prevent crashes on systems installed without enough virtual memory.
- // It has to do with what happens on UNIX systems if you call fork() on a
- // large process: the page-table entries are duplicated, which requires a
- // lot of virtual memory.
- VectorFst<StdArc> *decode_fst = fst::ReadFstKaldi(fst_in_filename);
-
- BaseFloat tot_like = 0.0;
- kaldi::int64 frame_count = 0;
- int num_success = 0, num_fail = 0;
- LatticeSimpleDecoder decoder(*decode_fst, decoder_opts);
-
- Timer timer;
-
- for (; !feature_reader.Done(); feature_reader.Next()) {
- string utt = feature_reader.Key();
- Matrix<BaseFloat> features(feature_reader.Value());
- feature_reader.FreeCurrent();
- if (features.NumRows() == 0) {
- KALDI_WARN << "Zero-length utterance: " << utt;
- num_fail++;
- continue;
- }
- double like;
- if (ProcessUtterance(decoder, am_sgmm, trans_model, sgmm_opts, log_prune,
- acoustic_scale, features, gselect_reader,
- spkvecs_reader, word_syms, utt, determinize,
- allow_partial, &alignment_writer, &words_writer,
- &compact_lattice_writer, &lattice_writer, &like)) {
- tot_like += like;
- frame_count += features.NumRows();
- KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
- << (like / features.NumRows()) << " over "
- << features.NumRows() << " frames.";
- num_success++;
- } else num_fail++;
- }
- double elapsed = timer.Elapsed();
- KALDI_LOG << "Time taken [excluding initialization] "<< elapsed
- << "s: real-time factor assuming 100 frames/sec is "
- << (elapsed*100.0/frame_count);
- KALDI_LOG << "Done " << num_success << " utterances, failed for "
- << num_fail;
- KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count)
- << " over " << frame_count << " frames.";
-
- delete word_syms;
- delete decode_fst;
- return (num_success != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-mixup.cc b/src/sgmmbin/sgmm-mixup.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-// sgmmbin/sgmm-mixup.cc
-
-// Copyright 2009-2011 Saarland University
-// Author: Arnab Ghoshal
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- const char *usage =
- "Increase number of sub-states or dimensions in SGMM\n"
- "Usage: sgmm-mixup [options] <model-in> <model-out>\n"
- "E.g. of mixing up:\n"
- " sgmm-mixup --read-occs=1.occs --num-substates=10000 1.mdl 2.mdl\n"
- "E.g. of increasing phonetic dim:\n"
- " sgmm-mixup --increase-phn-dim=50 1.mdl 2.mdl\n"
- "E.g. of increasing speaker dim:\n"
- " sgmm-mixup --increase-spk-dim=50 1.mdl 2.mdl\n"
- "E.g. of removing speaker space:\n"
- " sgmm-mixup --remove-speaker-space 1.mdl 2.mdl\n"
- "These modes may be combined.\n";
-
- bool binary_write = true;
- std::string write_flags_str = "gsnu";
- int32 split_substates = 0;
- int32 increase_phn_dim = 0;
- int32 increase_spk_dim = 0;
- bool remove_speaker_space = false;
- BaseFloat perturb_factor = 0.01;
- BaseFloat power = 0.2;
- BaseFloat max_cond = 100;
- std::string occs_in_filename;
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
- po.Register("split-substates", &split_substates, "Increase number of "
- "substates to this overall target.");
- po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space "
- "dimension as far as allowed towards this target.");
- po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space "
- "dimension as far as allowed towards this target.");
- po.Register("remove-speaker-space", &remove_speaker_space, "Remove speaker-specific "
- "projections N");
- po.Register("power", &power, "Exponent for substate occupancies used while "
- "splitting substates.");
- po.Register("perturb-factor", &perturb_factor, "Perturbation factor for "
- "state vectors while splitting substates.");
- po.Register("max-cond-split", &max_cond, "Max condition number of smoothing "
- "matrix used in substate splitting.");
- po.Register("write-flags", &write_flags_str, "Which SGMM parameters to "
- "write: subset of gsnu");
- po.Register("read-occs", &occs_in_filename, "Read occupancies from this file "
- "(required for mixing up)");
-
- po.Read(argc, argv);
- if (po.NumArgs() != 2) {
- po.PrintUsage();
- exit(1);
- }
- std::string model_in_filename = po.GetArg(1),
- model_out_filename = po.GetArg(2);
-
- kaldi::SgmmWriteFlagsType write_flags =
- StringToSgmmWriteFlags(write_flags_str);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- if (split_substates != 0) {
- if (occs_in_filename.empty())
- KALDI_ERR << "The --split-substates option requires the --read-occs option";
-
- Vector<BaseFloat> state_occs;
- {
- bool binary_in;
- kaldi::Input ki(occs_in_filename, &binary_in);
- state_occs.Read(ki.Stream(), binary_in);
- }
-
- am_sgmm.SplitSubstates(state_occs, split_substates, perturb_factor,
- power, max_cond);
- am_sgmm.ComputeDerivedVars(); // recompute normalizers...
- }
-
- if (increase_phn_dim != 0 || increase_spk_dim != 0) {
- // Feature normalizing transform matrix used to initialize the new columns
- // of the phonetic- or speaker-space projection matrices.
- kaldi::Matrix<BaseFloat> norm_xform;
- ComputeFeatureNormalizer(am_sgmm.full_ubm(), &norm_xform);
- if (increase_phn_dim != 0)
- am_sgmm.IncreasePhoneSpaceDim(increase_phn_dim, norm_xform);
- if (increase_spk_dim != 0)
- am_sgmm.IncreaseSpkSpaceDim(increase_spk_dim, norm_xform);
- }
-
- if (remove_speaker_space) {
- KALDI_LOG << "Removing speaker space (projections N_)";
- am_sgmm.RemoveSpeakerSpace();
- }
-
- {
- Output ko(model_out_filename, binary_write);
- trans_model.Write(ko.Stream(), binary_write);
- am_sgmm.Write(ko.Stream(), binary_write, write_flags);
- }
-
- KALDI_LOG << "Written model to " << model_out_filename;
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-normalize.cc b/src/sgmmbin/sgmm-normalize.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// sgmmbin/sgmm-normalize.cc
-
-// Copyright 2009-2011 Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- const char *usage =
- "Renormalize SGMM so that within certain subsets of UBM Gaussians (typically \n"
- "corresponding to gender), probabilities sum to one; write it out, including\n"
- "normalizers."
- "Note: gaussians-rspecifier will normally be \"ark:foo\" where foo looks like\n"
- " m 0 1 2 3 4 5\n"
- " f 6 7 8 9 10\n"
- "Usage: sgmm-normalize [options] <model-in> <gaussians-rspecifier> <model-out>\n";
-
- bool binary_write = true;
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
-
- po.Read(argc, argv);
- if (po.NumArgs() != 3) {
- po.PrintUsage();
- exit(1);
- }
- std::string model_in_filename = po.GetArg(1),
- gaussians_rspecifier = po.GetArg(2),
- model_out_filename = po.GetArg(3);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- std::vector<std::vector<int32> > norm_sets;
- SequentialInt32VectorReader vec_reader(gaussians_rspecifier);
- for (;!vec_reader.Done(); vec_reader.Next())
- norm_sets.push_back(vec_reader.Value());
-
- am_sgmm.ComputeNormalizersNormalized(norm_sets);
-
- {
- Output ko(model_out_filename, binary_write);
- trans_model.Write(ko.Stream(), binary_write);
- am_sgmm.Write(ko.Stream(), binary_write, kSgmmWriteAll);
- }
-
-
- KALDI_LOG << "Written model to " << model_out_filename;
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-post-to-gpost.cc b/src/sgmmbin/sgmm-post-to-gpost.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-// sgmmbin/sgmm-post-to-gpost.cc
-
-// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-// 2014 Guoguo Chen
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/posterior.h"
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- try {
- const char *usage =
- "Convert posteriors to Gaussian-level posteriors for SGMM training.\n"
- "Usage: sgmm-post-to-gpost [options] <model-in> <feature-rspecifier> "
- "<posteriors-rspecifier> <gpost-wspecifier>\n"
- "e.g.: sgmm-post-to-gpost 1.mdl 1.ali scp:train.scp 'ark:ali-to-post ark:1.ali ark:-|' ark:-";
-
- ParseOptions po(usage);
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- SgmmGselectConfig sgmm_opts;
- po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- sgmm_opts.Register(&po);
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- feature_rspecifier = po.GetArg(2),
- posteriors_rspecifier = po.GetArg(3),
- gpost_wspecifier = po.GetArg(4);
-
- using namespace kaldi;
- typedef kaldi::int32 int32;
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- double tot_like = 0.0;
- kaldi::int64 tot_t = 0;
-
- SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
- RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier);
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
-
- SgmmPerFrameDerivedVars per_frame_vars;
-
- SgmmGauPostWriter gpost_writer(gpost_wspecifier);
-
- int32 num_done = 0, num_no_posterior = 0, num_other_error = 0;
- for (; !feature_reader.Done(); feature_reader.Next()) {
- std::string utt = feature_reader.Key();
- if (!posteriors_reader.HasKey(utt)) {
- num_no_posterior++;
- } else {
- const Matrix<BaseFloat> &mat = feature_reader.Value();
- Posterior posterior = posteriors_reader.Value(utt);
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == mat.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- if (posterior.size() != mat.NumRows()) {
- KALDI_WARN << "Alignments has wrong size "<< (posterior.size()) <<
- " vs. "<< (mat.NumRows());
- num_other_error++;
- continue;
- }
-
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_other_error++;
- continue;
- }
- } // else spk_vars is "empty"
-
- num_done++;
- BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0;
-
- SgmmGauPost gpost(posterior.size()); // posterior.size() == T.
-
- SortPosteriorByPdfs(trans_model, &posterior);
- int32 prev_pdf_id = -1;
- BaseFloat prev_like = 0;
- Matrix<BaseFloat> prev_posterior;
- for (size_t i = 0; i < posterior.size(); i++) {
-
- std::vector<int32> this_gselect;
- if (!gselect->empty()) this_gselect = (*gselect)[i];
- else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect);
- am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0, &per_frame_vars);
-
- gpost[i].gselect = this_gselect;
- gpost[i].tids.resize(posterior[i].size());
- gpost[i].posteriors.resize(posterior[i].size());
-
- prev_pdf_id = -1; // Only cache for the same frame.
- for (size_t j = 0; j < posterior[i].size(); j++) {
- int32 tid = posterior[i][j].first, // transition identifier.
- pdf_id = trans_model.TransitionIdToPdf(tid);
- BaseFloat weight = posterior[i][j].second;
- gpost[i].tids[j] = tid;
-
- if (pdf_id != prev_pdf_id) {
- // First time see this pdf-id for this frame, update the cached
- // variables.
- prev_pdf_id = pdf_id;
- prev_like = am_sgmm.ComponentPosteriors(per_frame_vars, pdf_id,
- &prev_posterior);
- }
-
- gpost[i].posteriors[j] = prev_posterior;
- tot_like_this_file += prev_like * weight;
- tot_weight += weight;
- gpost[i].posteriors[j].Scale(weight);
- }
- }
-
- KALDI_LOG << "Average like for this file is "
- << (tot_like_this_file/posterior.size()) << " over "
- << posterior.size() <<" frames.";
- tot_like += tot_like_this_file;
- tot_t += posterior.size();
- if (num_done % 10 == 0)
- KALDI_LOG << "Avg like per frame so far is "
- << (tot_like/tot_t);
- gpost_writer.Write(utt, gpost);
- }
- }
-
- KALDI_LOG << "Overall like per frame (Gaussian only) = "
- << (tot_like/tot_t) << " over " << tot_t << " frames.";
-
- KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior
- << " with no posteriors, " << num_other_error
- << " with other errors.";
-
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-rescore-lattice.cc b/src/sgmmbin/sgmm-rescore-lattice.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-// sgmmbin/sgmm-rescore-lattice.cc
-
-// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal)
-// Cisco Systems (Author: Neha Agrawal)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "util/stl-utils.h"
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-#include "fstext/fstext-lib.h"
-#include "lat/kaldi-lattice.h"
-#include "lat/lattice-functions.h"
-#include "sgmm/decodable-am-sgmm.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- typedef kaldi::int64 int64;
- using fst::SymbolTable;
- using fst::VectorFst;
- using fst::StdArc;
-
- const char *usage =
- "Replace the acoustic scores on a lattice using a new model.\n"
- "Usage: sgmm-rescore-lattice [options] <model-in> <lattice-rspecifier> "
- "<feature-rspecifier> <lattice-wspecifier>\n"
- " e.g.: sgmm-rescore-lattice 1.mdl ark:1.lats scp:trn.scp ark:2.lats\n";
-
- kaldi::BaseFloat old_acoustic_scale = 0.0;
- bool speedup = false;
- BaseFloat log_prune = 5.0;
- std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier;
- SgmmGselectConfig sgmm_opts;
- kaldi::ParseOptions po(usage);
- po.Register("old-acoustic-scale", &old_acoustic_scale,
- "Add the current acoustic scores with some scale.");
- po.Register("log-prune", &log_prune,
- "Pruning beam used to reduce number of exp() evaluations.");
- po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)");
- po.Register("utt2spk", &utt2spk_rspecifier,
- "rspecifier for utterance to speaker map");
- po.Register("gselect", &gselect_rspecifier,
- "Precomputed Gaussian indices (rspecifier)");
- po.Register("speedup", &speedup,
- "If true, enable a faster version of the computation that "
- "saves times when there is only one pdf-id on a single frame "
- "by only sometimes (randomly) computing the probabilities, and "
- "then scaling them up to preserve corpus-level diagnostics.");
-
- sgmm_opts.Register(&po);
-
- po.Read(argc, argv);
-
- if (po.NumArgs() != 4) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string model_filename = po.GetArg(1),
- lats_rspecifier = po.GetArg(2),
- feature_rspecifier = po.GetArg(3),
- lats_wspecifier = po.GetArg(4);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier);
- RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier,
- utt2spk_rspecifier);
- RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
- // Read as regular lattice
- SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier);
- // Write as compact lattice.
- CompactLatticeWriter compact_lattice_writer(lats_wspecifier);
-
- int32 num_done = 0, num_err = 0;
- for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) {
- std::string utt = compact_lattice_reader.Key();
- if (!feature_reader.HasKey(utt)) {
- KALDI_WARN << "No feature found for utterance " << utt << ". Skipping";
- num_err++;
- continue;
- }
-
- CompactLattice clat = compact_lattice_reader.Value();
- compact_lattice_reader.FreeCurrent();
- if (old_acoustic_scale != 1.0)
- fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale), &clat);
-
- const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
-
- // Get speaker vectors
- SgmmPerSpkDerivedVars spk_vars;
- if (spkvecs_reader.IsOpen()) {
- if (spkvecs_reader.HasKey(utt)) {
- spk_vars.v_s = spkvecs_reader.Value(utt);
- am_sgmm.ComputePerSpkDerivedVars(&spk_vars);
- } else {
- KALDI_WARN << "Cannot find speaker vector for " << utt;
- num_err++;
- continue;
- }
- } // else spk_vars is "empty"
-
- bool have_gselect = !gselect_rspecifier.empty()
- && gselect_reader.HasKey(utt)
- && gselect_reader.Value(utt).size() == feats.NumRows();
- if (!gselect_rspecifier.empty() && !have_gselect)
- KALDI_WARN << "No Gaussian-selection info available for utterance "
- << utt << " (or wrong size)";
- std::vector<std::vector<int32> > empty_gselect;
- const std::vector<std::vector<int32> > *gselect =
- (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect);
-
- DecodableAmSgmm sgmm_decodable(sgmm_opts, am_sgmm, spk_vars,
- trans_model, feats, *gselect,
- log_prune);
-
- if (!speedup) {
- if (kaldi::RescoreCompactLattice(&sgmm_decodable, &clat)) {
- compact_lattice_writer.Write(utt, clat);
- num_done++;
- } else num_err++;
- } else {
- BaseFloat speedup_factor = 100.0;
- if (kaldi::RescoreCompactLatticeSpeedup(trans_model, speedup_factor,
- &sgmm_decodable,
- &clat)) {
- compact_lattice_writer.Write(utt, clat);
- num_done++;
- } else num_err++;
- }
- }
-
- KALDI_LOG << "Done " << num_done << " lattices, errors on "
- << num_err;
- return (num_done != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
diff --git a/src/sgmmbin/sgmm-sum-accs.cc b/src/sgmmbin/sgmm-sum-accs.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// sgmmbin/sgmm-sum-accs.cc
-
-// Copyright 2009-2011 Saarland University; Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "util/common-utils.h"
-#include "sgmm/estimate-am-sgmm.h"
-#include "hmm/transition-model.h"
-
-
-int main(int argc, char *argv[]) {
- try {
- typedef kaldi::int32 int32;
-
- const char *usage =
- "Sum multiple accumulated stats files for SGMM training.\n"
- "Usage: sgmm-sum-accs [options] stats-out stats-in1 stats-in2 ...\n";
-
- bool binary = true;
- kaldi::ParseOptions po(usage);
- po.Register("binary", &binary, "Write output in binary mode");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 2) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string stats_out_filename = po.GetArg(1);
- kaldi::Vector<double> transition_accs;
- kaldi::MleAmSgmmAccs sgmm_accs;
-
- for (int i = 2, max = po.NumArgs(); i <= max; i++) {
- std::string stats_in_filename = po.GetArg(i);
- bool binary_read;
- kaldi::Input ki(stats_in_filename, &binary_read);
- transition_accs.Read(ki.Stream(), binary_read, true /* add values */);
- sgmm_accs.Read(ki.Stream(), binary_read, true /* add values */);
- }
-
- // Write out the accs
- {
- kaldi::Output ko(stats_out_filename, binary);
- transition_accs.Write(ko.Stream(), binary);
- sgmm_accs.Write(ko.Stream(), binary);
- }
-
- KALDI_LOG << "Written stats to " << stats_out_filename;
- } catch(const std::exception &e) {
- std::cerr << e.what() << '\n';
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-sum-tree-stats.cc b/src/sgmmbin/sgmm-sum-tree-stats.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-// sgmmbin/sgmm-sum-tree-stats.cc
-
-// Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-#include "tree/context-dep.h"
-#include "tree/build-tree-utils.h"
-#include "sgmm/sgmm-clusterable.h"
-
-
-int main(int argc, char *argv[]) {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- try {
- const char *usage =
- "Sum SGMM-type statistics used for phonetic decision tree building.\n"
- "Usage: sgmm-sum-tree-stats [options] tree-accs-out trea-accs-in1 tree-accs-in2 ...\n"
- "e.g.: sgmm-sum-tree-stats treeacc 1.streeacc 2.streeacc 3.streeacc\n";
-
- ParseOptions po(usage);
- bool binary = true;
-
- po.Register("binary", &binary, "Write output in binary mode");
- po.Read(argc, argv);
-
- if (po.NumArgs() < 2) {
- po.PrintUsage();
- exit(1);
- }
-
- std::string treeacc_wxfilename = po.GetArg(1);
-
- std::map<EventType, Clusterable*> tree_stats;
-
- AmSgmm am_sgmm; // dummy variable needed to initialize stats.
- std::vector<SpMatrix<double> > H; // also needed to initialize stats,
- // but never accessed in this program.
-
- // typedef std::vector<std::pair<EventType, Clusterable*> > BuildTreeStatsType;
- for (int32 arg = 2; arg <= po.NumArgs(); arg++) {
- std::string treeacc_rxfilename = po.GetArg(arg);
- bool binary_in;
- Input ki(treeacc_rxfilename, &binary_in);
- BuildTreeStatsType stats_array;
- SgmmClusterable example(am_sgmm, H); // Needed for its type information.
- ReadBuildTreeStats(ki.Stream(), binary_in, example, &stats_array);
- for (BuildTreeStatsType::iterator iter = stats_array.begin();
- iter != stats_array.end(); ++iter) {
- EventType e = iter->first;
- Clusterable *c = iter->second;
- std::map<EventType, Clusterable*>::iterator map_iter = tree_stats.find(e);
- if (map_iter == tree_stats.end()) { // Not already present.
- tree_stats[e] = c;
- } else {
- map_iter->second->Add(*c);
- delete c;
- }
- }
- }
-
- BuildTreeStatsType stats; // all the stats, in vectorized form.
-
- for (std::map<EventType, Clusterable*>::const_iterator iter = tree_stats.begin();
- iter != tree_stats.end();
- iter++ ) {
- stats.push_back(std::make_pair(iter->first, iter->second));
- }
- tree_stats.clear();
-
- {
- Output ko(treeacc_wxfilename, binary);
- WriteBuildTreeStats(ko.Stream(), binary, stats);
- }
- KALDI_LOG << "Wrote summed sgmm-treeaaccs: number of separate objects was "
- << stats.size();
- DeleteBuildTreeStats(&stats);
- return (stats.size() != 0 ? 0 : 1);
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
diff --git a/src/sgmmbin/sgmm-write-ubm.cc b/src/sgmmbin/sgmm-write-ubm.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-// sgmmbin/sgmm-write-ubm.cc
-
-// Copyright 2009-2011 Microsoft Corporation
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "base/kaldi-common.h"
-#include "util/common-utils.h"
-
-#include "sgmm/am-sgmm.h"
-#include "hmm/transition-model.h"
-
-int main(int argc, char *argv[]) {
- try {
- using namespace kaldi;
- typedef kaldi::int32 int32;
- const char *usage =
- "Write out the full-covariance UBM of the SGMM\n"
- "Usage: sgmm-write-ubm [options] <model-in> <ubm-out>\n"
- "e.g.: sgmm-write-ubm 1.mdl 1.ubm\n";
-
- bool binary_write = true;
-
- ParseOptions po(usage);
- po.Register("binary", &binary_write, "Write output in binary mode");
-
- po.Read(argc, argv);
- if (po.NumArgs() != 2) {
- po.PrintUsage();
- exit(1);
- }
- std::string model_in_filename = po.GetArg(1),
- ubm_out_filename = po.GetArg(2);
-
- AmSgmm am_sgmm;
- TransitionModel trans_model;
- {
- bool binary;
- Input ki(model_in_filename, &binary);
- trans_model.Read(ki.Stream(), binary);
- am_sgmm.Read(ki.Stream(), binary);
- }
-
- {
- Output ko(ubm_out_filename, binary_write);
- am_sgmm.full_ubm().Write(ko.Stream(), binary_write);
- }
-
- KALDI_LOG << "Written UBM to " << ubm_out_filename;
- return 0;
- } catch(const std::exception &e) {
- std::cerr << e.what();
- return -1;
- }
-}
-
-
index 817d0c65bc33ae7f6f42f0ceffb276fe7c174462..d19e17f6b68acf9da191d759aa23787594c54969 100644 (file)
namespace kaldi {
-// Note: see sgmm/sgmm-clusterable.h for an SGMM-based clusterable
-// class. We didn't include it here, to avoid adding an extra
-// dependency to this directory.
-
/// \addtogroup clustering_group
/// @{