From: Xiaohui Zhang Date: Tue, 15 Aug 2017 05:34:23 +0000 (-0400) Subject: [scripts,src] Check that symbol '#0' is not in the vocab of the ARPA LM file or the... X-Git-Url: https://git.ti.com/gitweb?p=processor-sdk%2Fkaldi.git;a=commitdiff_plain;h=12fa929fc831e7b540fc332393042d910dd358a8;ds=sidebyside [scripts,src] Check that symbol '#0' is not in the vocab of the ARPA LM file or the lexicon [thanks: nitzan@almagu.com] (#1806) --- diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index 5c089fcd0..a5c9ff8da 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -186,7 +186,7 @@ sub check_lexicon { if (!defined $word) { print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); } - if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>") { + if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") { print "--> ERROR: lexicon.txt contains forbidden word $word\n"; set_to_fail(); } diff --git a/src/lm/arpa-lm-compiler.cc b/src/lm/arpa-lm-compiler.cc index 634a6267c..c854b077d 100644 --- a/src/lm/arpa-lm-compiler.cc +++ b/src/lm/arpa-lm-compiler.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Gilles Boulianne // Copyright 2016 Smart Action LLC (kkm) +// Copyright 2017 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -194,6 +195,9 @@ void ArpaLmCompilerImpl::ConsumeNGram(const NGram &ngram, StateId dest; Symbol sym = ngram.words.back(); float weight = -ngram.logprob; + if (sym == sub_eps_ || sym == 0) { + KALDI_ERR << " <eps> or disambiguation symbol " << sym << "found in the ARPA file. "; + } if (sym == eos_symbol_) { if (sub_eps_ == 0) { // Keep as a real symbol when not substituting.