1 #!/usr/bin/env perl
3 # Apache 2.0.
4 # Guoguo Chen (guoguo@jhu.edu)
5 # Daniel Povey (dpovey@gmail.com)
6 #
7 # Validation script for data/local/dict
# Exactly one command-line argument is expected: the dictionary directory.
die "Usage: validate_dict_dir.pl dict_directory\n" unless @ARGV == 1;

# Directory being validated; one trailing slash (if any) is stripped so that
# paths built as "$dict/<file>" do not contain a doubled slash.
($dict = shift @ARGV) =~ s:/$::;

$exit = 0;     # overall result; becomes 1 as soon as any check fails.
$success = 1;  # per-file result; this is re-set each time we read a file.
# Records a validation failure: marks the whole run as failed ($exit) and the
# file currently being checked as not OK ($success).
sub set_to_fail {
  $exit = 1;
  $success = 0;
}
# Checking silence_phones.txt -------------------------------
# Every line must be non-empty and end in a newline. Each phone may appear only
# once in the file and must not contain "#<digits>" or end in _B, _E, _S or _I.
# The phones found are recorded in %silence for the checks further down.
print "Checking $dict/silence_phones.txt ...\n";
# "! -s" is true for a missing file as well as for an empty one, which is what
# the message promises ("-z" is false when the file does not exist).
if (! -s "$dict/silence_phones.txt") {
  print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n";
  exit 1;
}
my $silence_fh;
if (!open($silence_fh, "<", "$dict/silence_phones.txt")) {
  print "--> ERROR: fail to open $dict/silence_phones.txt\n";
  exit 1;
}
$idx = 1;       # current line number, used in error messages.
%silence = ();
$success = 1;

print "--> reading $dict/silence_phones.txt\n";
while (my $line = <$silence_fh>) {
  # Only the last line of a file can lack the terminating newline.
  if (!($line =~ s/\n$//)) {
    print "--> ERROR: last line '$line' of $dict/silence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $line);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
  }
  foreach my $p (@col) {
    if ($silence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close($silence_fh);
print "--> $dict/silence_phones.txt is OK\n" unless $success == 0;
print "\n";
# Checking optional_silence.txt -------------------------------
# The file must hold exactly one phone, and that phone must be one of the
# silence phones collected in %silence.
print "Checking $dict/optional_silence.txt ...\n";
# "! -s" is true for a missing file as well as for an empty one, which is what
# the message promises ("-z" is false when the file does not exist).
if (! -s "$dict/optional_silence.txt") {
  print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n";
  exit 1;
}
my $optsil_fh;
if (!open($optsil_fh, "<", "$dict/optional_silence.txt")) {
  print "--> ERROR: fail to open $dict/optional_silence.txt\n";
  exit 1;
}
$idx = 1;       # current line number.
$success = 1;
print "--> reading $dict/optional_silence.txt\n";
while (my $line = <$optsil_fh>) {
  chomp $line;
  my @col = split(" ", $line);
  if ($idx > 1 or @col > 1) {
    set_to_fail();
    print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
  } elsif (@col == 0) {
    # A blank first line has no phone to look up; report it as such rather
    # than as an unknown phone with an empty name.
    set_to_fail();
    print "--> ERROR: empty line in $dict/optional_silence.txt\n";
  } elsif (!$silence{$col[0]}) {
    set_to_fail();
    print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
  }
  $idx++;
}
close($optsil_fh);
print "--> $dict/optional_silence.txt is OK\n" unless $success == 0;
print "\n";
# Checking nonsilence_phones.txt -------------------------------
# Same rules as for silence_phones.txt: no empty lines, newline-terminated, no
# duplicate phones, no phone containing "#<digits>" or ending in _B/_E/_S/_I.
# The phones found are recorded in %nonsilence for the checks further down.
print "Checking $dict/nonsilence_phones.txt ...\n";
# "! -s" is true for a missing file as well as for an empty one, which is what
# the message promises ("-z" is false when the file does not exist).
if (! -s "$dict/nonsilence_phones.txt") {
  print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n";
  exit 1;
}
my $nonsilence_fh;
if (!open($nonsilence_fh, "<", "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n";
  exit 1;
}
$idx = 1;       # current line number, used in error messages.
%nonsilence = ();
$success = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while (my $line = <$nonsilence_fh>) {
  # Only the last line of a file can lack the terminating newline.
  if (!($line =~ s/\n$//)) {
    print "--> ERROR: last line '$line' of $dict/nonsilence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $line);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
  }
  foreach my $p (@col) {
    if ($nonsilence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close($nonsilence_fh);
print "--> $dict/nonsilence_phones.txt is OK\n" unless $success == 0;
print "\n";
# Checking disjoint -------------------------------
# Returns the keys present in both hashes.
#   $first, $second: hash references; only key existence is considered, the
#                    stored values are ignored.
# In list context the common keys are returned (in the iteration order of
# %$first); in scalar context their count is returned.
sub intersect {
  # Lexicals throughout: the sub no longer overwrites the package variables
  # @itset/%itset, and avoids "my $a"/"my $b", which shadow sort's variables.
  my ($first, $second) = @_;
  # Hash keys are unique already, so no extra de-duplication is required.
  my @common = grep { exists $second->{$_} } keys %$first;
  return @common;
}
# A phone must not be listed both as silence and as non-silence; report every
# phone that appears in both sets.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset != 0) {
  set_to_fail();
  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
  foreach my $shared_phone (@itset) {
    print "$shared_phone ";
  }
  print "\n";
} else {
  print "--> disjoint property is OK.\n";
}
print "\n";
# Validates a single lexicon file and prints one "--> ERROR" line per problem.
#   $lex              path of the lexicon file to read.
#   $num_prob_cols    number of columns following the word that must each hold
#                     a value in (0, 1].
#   $num_skipped_cols number of further columns dropped, without inspection,
#                     before the pronunciation starts.
# Checks performed: no repeated lines, last line newline-terminated, no empty
# lines, the word is not <s> or </s>, the pronunciation is non-empty, and every
# phone is a key of the global %silence or %nonsilence.
# Failures are recorded through set_to_fail(); $success is reset on entry so
# the final "is OK" line reflects this file only. Nothing is returned.
sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  $success = 1;
  my $lex_fh;
  if (!open($lex_fh, "<", $lex)) {
    # Stop here: there is nothing to read, and falling through would end up
    # reporting the unreadable file as OK.
    print "--> ERROR: fail to open $lex\n";
    set_to_fail();
    print "\n";
    return;
  }
  my %seen_line = ();   # raw lines read so far, used to detect repeats.
  my $line_num = 1;
  print "--> reading $lex\n";
  while (my $line = <$lex_fh>) {
    # Duplicate detection works on the raw line, newline included.
    if (defined $seen_line{$line}) {
      print "--> ERROR: line '$line' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$line} = 1;
    if (!($line =~ s/\n$//)) {
      print "--> ERROR: last line '$line' of $lex does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $line);
    my $word = shift @col;
    if (!defined $word) {
      # Nothing else on the line can be checked; avoid follow-up errors that
      # would only restate that the line is empty.
      print "--> ERROR: empty lexicon line in $lex\n";
      set_to_fail();
      $line_num++;
      next;
    }
    if ($word eq "<s>" || $word eq "</s>") {
      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
      set_to_fail();
    }
    for (my $n = 0; $n < $num_prob_cols; $n++) {
      # A missing or non-numeric column evaluates to 0 and is rejected too.
      my $prob = shift @col;
      if (!($prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$line', in $lex\n";
        set_to_fail();
      }
    }
    for (my $n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
    if (@col == 0) {
      print "--> ERROR: lexicon.txt contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    foreach my $phone (@col) {
      if (!$silence{$phone} and !$nonsilence{$phone}) {
        print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt ";
        print "(line $line_num)\n";
        set_to_fail();
      }
    }
    $line_num++;
  }
  close($lex_fh);
  print "--> $lex is OK\n" unless $success == 0;
  print "\n";
  return;
}
# Validate whichever lexicon files are present in $dict.
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    my $silprob_fh;
    if (open($silprob_fh, "<", "$dict/silprob.txt")) {
      # Each line is "<label> <value>". "<s>" and "overall" take a value in
      # (0, 1]; "</s>_s" and "</s>_n" take a positive value; any other label
      # is an error.
      while (my $line = <$silprob_fh>) {
        chomp $line;
        my @col = split(" ", $line);
        if (@col != 2) {
          # Report and keep going like every other check in this script,
          # instead of aborting the whole run.
          print "--> ERROR: bad line \"$line\"\n";
          set_to_fail();
          next;
        }
        if ($col[0] eq "<s>" || $col[0] eq "overall") {
          if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
            set_to_fail();
            print "--> ERROR: bad probability in $dict/silprob.txt \"$line\"\n";
          }
        } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
          if ($col[1] <= 0.0) {
            set_to_fail();
            print "--> ERROR: bad correction term in $dict/silprob.txt \"$line\"\n";
          }
        } else {
          print "--> ERROR: unexpected line in $dict/silprob.txt \"$line\"\n";
          set_to_fail();
        }
      }
      close($silprob_fh);
    } else {
      print "--> ERROR: fail to open $dict/silprob.txt\n";
      set_to_fail();
    }
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}

# At least one of lexicon.txt / lexiconp.txt has to be present.
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
  set_to_fail();
}
# Checks that two lexicon files describe the same entries in the same order.
#   $lex1, $lex2                          paths of the two lexicon files.
#   $num_prob_cols1, $num_skipped_cols1   columns after the word to drop from
#                                         each line of $lex1 before comparing
#                                         pronunciations.
#   $num_prob_cols2, $num_skipped_cols2   the same for $lex2.
# Line by line, the word (first column) and the remaining pronunciation columns
# must be equal; both files must also have the same number of lines. Comparison
# stops at the first difference. Failures are recorded via set_to_fail().
# $success is reset on entry so the final "match" line reflects this pair only.
sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  print "Checking lexicon pair $lex1 and $lex2\n";
  $success = 1;
  my ($fh1, $fh2);
  if (!open($fh1, "<", $lex1)) {
    print "--> ERROR: fail to open $lex1\n";
    set_to_fail();
    return;
  }
  if (!open($fh2, "<", $lex2)) {
    print "--> ERROR: fail to open $lex2\n";
    set_to_fail();
    close($fh1);
    return;
  }
  my $line_num = 0;
  my $stopped_early = 0;   # set when the loop below ends on an error.
  while (my $line_a = <$fh1>) {
    $line_num++;
    my @A = split(" ", $line_a);
    my $line_b = <$fh2>;
    if (!defined $line_b) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); $stopped_early = 1; last;
    }
    my @B = split(" ", $line_b);
    # Check if the word matches.
    if ($A[0] ne $B[0]) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $stopped_early = 1; last;
    }
    shift @A; shift @B;
    for (my $n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n++) { shift @A; }
    for (my $n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n++) { shift @B; }
    # Check if the pronunciation matches
    if (join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $stopped_early = 1; last;
    }
  }
  # Leftover lines in $lex2 only mean something when $lex1 was read to its
  # end. This is decided from this call's own state, not from the global
  # $exit, so errors found by earlier checks cannot hide a length mismatch.
  if (!$stopped_early) {
    my $extra_line = <$fh2>;
    if (defined $extra_line) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail();
    }
  }
  close($fh1);
  close($fh2);
  print "--> lexicon pair $lex1 and $lex2 match\n\n" unless $success == 0;
  return;
}
# When several lexicon variants are present they must describe the same
# entries. The user may have overwritten one of them so that the other needs
# regenerating, but we cannot tell which is which, so a mismatch is an error.
check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0)
  if -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt";

check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
                   "$dict/lexiconp_silprob.txt", 2, 2)
  if -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt";
273 # Checking extra_questions.txt -------------------------------
274 %distinguished = (); # Keep track of all phone-pairs including nonsilence that
275 # are distinguished (split apart) by extra_questions.txt,
276 # as $distinguished{$p1,$p2} = 1. This will be used to
277 # make sure that we don't have pairs of phones on the same
278 # line in nonsilence_phones.txt that can never be
279 # distinguished from each other by questions. (If any two
280 # phones appear on the same line in nonsilence_phones.txt,
281 # they share a tree root, and since the automatic
282 # question-building treats all phones that appear on the
283 # same line of nonsilence_phones.txt as being in the same
284 # group, we can never distinguish them without resorting to
285 # questions in extra_questions.txt.
286 print "Checking $dict/extra_questions.txt ...\n";
287 if (-s "$dict/extra_questions.txt") {
288 if (!open(EX, "<$dict/extra_questions.txt")) {
289 set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
290 }
291 $idx = 1;
292 $success = 1;
293 print "--> reading $dict/extra_questions.txt\n";
294 while(<EX>) {
295 if (! s/\n$//) {
296 print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
297 set_to_fail();
298 }
299 my @col = split(" ", $_);
300 if (@col == 0) {
301 set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
302 }
303 foreach (0 .. @col-1) {
304 if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
305 set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n";
306 }
307 $idx ++;
308 }
309 %col_hash = ();
310 foreach $p (@col) { $col_hash{$p} = 1; }
311 foreach $p1 (@col) {
312 # Update %distinguished hash.
313 foreach $p2 (keys %nonsilence) {
314 if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
315 # in this question (and in nonsilence
316 # phones)... mark p1,p2 as being split apart
317 $distinguished{$p1,$p2} = 1;
318 $distinguished{$p2,$p1} = 1;
319 }
320 }
321 }
322 }
323 close(EX);
324 $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
325 } else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
# check nonsilence_phones.txt again for phone-pairs that are never
# distinguishable.  (note: this situation is normal and expected for silence
# phones, so we don't check it.)
# Two different phones on the same line fail the check when no entry exists
# for them in %distinguished.
my $nonsil_again_fh;
if (!open($nonsil_again_fh, "<", "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
}

$num_warn_nosplit = 0;          # offending (ordered) pairs seen so far.
$num_warn_nosplit_limit = 10;   # the detailed message is printed while the
                                # count has not gone past this value.
while (my $line = <$nonsil_again_fh>) {
  my @col = split(" ", $line);
  foreach my $p1 (@col) {
    foreach my $p2 (@col) {
      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
        set_to_fail();
        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
        }
        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
          print "... Not warning any more times about this issue.\n";
        }
        # The explanatory note accompanies the first report only.
        if ($num_warn_nosplit == 0) {
          print "    (note: we started checking for this only recently.  You can still build a system but\n";
          print "     phones $p1 and $p2 will be acoustically indistinguishable).\n";
        }
        $num_warn_nosplit++;
      }
    }
  }
}
# Release the handle once the scan is complete.
close($nonsil_again_fh);
# Final verdict: exit status 0 when no check called set_to_fail(), 1 otherwise.
if ($exit != 1) {
  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
  exit 0;
}

print "--> ERROR validating dictionary directory $dict (see detailed error ";
print "messages above)\n\n";
exit 1;