a5c9ff8da21b519bc3aac69a875f6f88d6095304
1 #!/usr/bin/env perl
3 # Apache 2.0.
4 # Guoguo Chen (guoguo@jhu.edu)
5 # Daniel Povey (dpovey@gmail.com)
6 #
7 # Validation script for data/local/dict
# Exactly one argument is expected: the dictionary directory to validate.
unless (@ARGV == 1) {
  die join("",
           "Usage: validate_dict_dir.pl <dict-dir>\n",
           "e.g.: validate_dict_dir.pl data/local/dict\n");
}

# Take the directory from the command line and drop one trailing slash, so
# that paths built as "$dict/<file>" do not contain a doubled slash.
($dict = shift @ARGV) =~ s{/$}{};

$exit = 0;     # Overall status; becomes 1 as soon as any check has failed.
$success = 1;  # Status of the file currently being read; re-set per file.
# Records a validation failure: the whole run is marked as failed ($exit) and
# the file currently being checked as not OK ($success).  Returns 0, the value
# assigned to $success.
sub set_to_fail {
  $exit = 1;
  return $success = 0;
}
# Checking silence_phones.txt -------------------------------
# Each line lists one or more phones.  A phone may occur only once in the
# file and must not have a reserved written form; every phone read is stored
# as a key of %silence for the checks further down.
print "Checking $dict/silence_phones.txt ...\n";
# -s is false both for a missing file and for an empty one, which is what the
# message below reports (-z is only true for a file that exists and is empty).
if (! -s "$dict/silence_phones.txt") {
  print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n";
  exit 1;
}
# Three-argument open with a lexical handle: the path is taken literally.
my $silence_fh;
if (!open($silence_fh, "<", "$dict/silence_phones.txt")) {
  print "--> ERROR: fail to open $dict/silence_phones.txt\n";
  exit 1;
}
$idx = 1;       # current line number, used in the messages
%silence = ();
$success = 1;
$crlf = 1;      # stays 1 until the carriage-return error has been printed

print "--> reading $dict/silence_phones.txt\n";
while (<$silence_fh>) {
  # Only the last line of a file can lack the terminating newline.
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  # Report carriage returns once per file, not once per line.
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
  }
  foreach my $p (@col) {
    if ($silence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    # phones that start with the pound sign/hash may be mistaken for
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>") {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close($silence_fh);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
print "\n";
# Checking optional_silence.txt -------------------------------
# The file must consist of a single line holding a single phone, and that
# phone must be one of the keys of %silence.
print "Checking $dict/optional_silence.txt ...\n";
# -s is false both for a missing file and for an empty one, which is what the
# message below reports (-z is only true for a file that exists and is empty).
if (! -s "$dict/optional_silence.txt") {
  print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n";
  exit 1;
}
# Three-argument open with a lexical handle: the path is taken literally.
my $optional_silence_fh;
if (!open($optional_silence_fh, "<", "$dict/optional_silence.txt")) {
  print "--> ERROR: fail to open $dict/optional_silence.txt\n";
  exit 1;
}
$idx = 1;       # current line number
$success = 1;
$crlf = 1;      # stays 1 until the carriage-return error has been printed
print "--> reading $dict/optional_silence.txt\n";
while (<$optional_silence_fh>) {
  chomp;
  my @col = split(" ", $_);
  if ($idx > 1 or @col > 1) {
    set_to_fail();
    print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
  } elsif (@col == 0) {
    # A blank first line has no phone to look up; say so instead of printing
    # an empty phone name.
    set_to_fail();
    print "--> ERROR: empty line in $dict/optional_silence.txt (line $idx)\n";
  } elsif (!$silence{$col[0]}) {
    set_to_fail();
    print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
  }
  # Report carriage returns once per file, not once per line.
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  $idx++;
}
close($optional_silence_fh);
$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
print "\n";
# Checking nonsilence_phones.txt -------------------------------
# Each line lists one or more phones.  A phone may occur only once in the
# file and must not have a reserved written form; every phone read is stored
# as a key of %nonsilence for the checks further down.
print "Checking $dict/nonsilence_phones.txt ...\n";
# -s is false both for a missing file and for an empty one, which is what the
# message below reports (-z is only true for a file that exists and is empty).
if (! -s "$dict/nonsilence_phones.txt") {
  print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n";
  exit 1;
}
# Three-argument open with a lexical handle: the path is taken literally.
my $nonsilence_fh;
if (!open($nonsilence_fh, "<", "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n";
  exit 1;
}
$idx = 1;       # current line number, used in the messages
%nonsilence = ();
$success = 1;
$crlf = 1;      # stays 1 until the carriage-return error has been printed
print "--> reading $dict/nonsilence_phones.txt\n";
while (<$nonsilence_fh>) {
  # Report carriage returns once per file, not once per line.
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  # Only the last line of a file can lack the terminating newline.
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
  }
  foreach my $p (@col) {
    if ($nonsilence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    # phones that start with the pound sign/hash may be mistaken for
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>") {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close($nonsilence_fh);
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";
141 # Checking disjoint -------------------------------
# Returns the keys that are present in both of the given hashes.
#   $set1, $set2: references to hashes whose keys are the set members; the
#                 values are ignored.
# In list context the result is the sorted list of common keys (empty when
# the sets are disjoint), so output built from it is reproducible; in scalar
# context it is the number of common keys.
sub intersect {
  # Deliberately not named $a/$b: those are the variables sort comparators
  # use, and lexical copies of them get in the way of sorting.
  my ($set1, $set2) = @_;
  # Hash keys are unique, so no extra bookkeeping against duplicates is needed.
  my @common = sort grep { exists $set2->{$_} } keys %$set1;
  return @common;
}
# Silence and non-silence phones must not share any phone; every shared phone
# is listed in the error message.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset) {
  set_to_fail();
  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
  print "$_ " foreach @itset;
  print "\n";
} else {
  print "--> disjoint property is OK.\n";
}
print "\n";
# Validates one lexicon file.
#   $lex:              path of the lexicon to check.
#   $num_prob_cols:    number of probability columns that follow the word;
#                      each must satisfy 0 < p <= 1.
#   $num_skipped_cols: number of further columns that are dropped without
#                      being checked.
# Whatever remains on a line is the pronunciation; it must not be empty and
# all of its phones must be keys of %silence or %nonsilence.  Repeated lines,
# forbidden words, carriage returns and a missing final newline are reported
# as well.  Problems are printed and recorded with set_to_fail(); the return
# value carries no information.
sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  $idx = 1; $success = 1; $crlf = 1;
  my $lex_fh;
  if (!open($lex_fh, "<", $lex)) {
    # Written as separate statements on purpose: in
    # 'print "..." && set_to_fail()' the '&&' is part of print's argument,
    # so the status value gets printed instead of the message.
    print "--> ERROR: fail to open $lex\n";
    set_to_fail();
    print "\n";
    return;
  }
  my %seen_line = ();
  print "--> reading $lex\n";
  while (<$lex_fh>) {
    # Report carriage returns once per file, not once per line.
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    # Only the last line of a file can lack the terminating newline.
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
      set_to_fail();
    }
    # Duplicates are looked up after the newline is removed, so the message
    # shows the line without a line break in the middle.
    if (defined $seen_line{$_}) {
      print "--> ERROR: line '$_' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$_} = 1;
    my @col = split(" ", $_);
    my $word = shift @col;
    if (!defined $word) {
      # Nothing else on an empty line can be checked; skipping it avoids a
      # string of follow-on errors about an undefined word.
      print "--> ERROR: empty lexicon line in $lex\n";
      set_to_fail();
      $idx++;
      next;
    }
    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
      print "--> ERROR: $lex contains forbidden word $word\n";
      set_to_fail();
    }
    foreach my $n (1 .. $num_prob_cols) {
      my $prob = shift @col;
      # A missing column leaves $prob undefined, which also fails this test.
      if (!($prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
        set_to_fail();
      }
    }
    foreach my $n (1 .. $num_skipped_cols) { shift @col; }
    if (@col == 0) {
      print "--> ERROR: $lex contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    foreach my $phone (@col) {
      if (!$silence{$phone} and !$nonsilence{$phone}) {
        print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt ";
        print "(line $idx)\n";
        set_to_fail();
      }
    }
    $idx++;
  }
  close($lex_fh);
  $success == 0 || print "--> $lex is OK\n";
  print "\n";
}
# Check every lexicon variant that is present.  The second and third
# arguments of check_lexicon() are the number of probability columns and the
# number of unchecked columns that follow the word.
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    my $silprob_fh;
    if (!open($silprob_fh, "<", "$dict/silprob.txt")) {
      # Separate statements on purpose: in 'print "..." && set_to_fail()' the
      # '&&' is part of print's argument, so the message itself is lost.
      print "--> ERROR: fail to open $dict/silprob.txt\n";
      set_to_fail();
    } else {
      $crlf = 1;  # stays 1 until the carriage-return error has been printed
      while (<$silprob_fh>) {
        if ($crlf == 1 && m/\r/) {
          print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
          set_to_fail();
          $crlf = 0;
        }
        chomp;
        my @col = split;
        # Every line is expected to be "<label> <value>".  A malformed line
        # is reported like every other problem and skipped, so the remaining
        # checks still run and the script still exits with status 1.
        if (@col != 2) {
          print "--> ERROR: bad line \"$_\" in $dict/silprob.txt\n";
          set_to_fail();
          next;
        }
        if ($col[0] eq "<s>" || $col[0] eq "overall") {
          # These two entries must lie in (0, 1].
          if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
            set_to_fail();
            print "--> ERROR: bad probability in $dict/silprob.txt \"$_\"\n";
          }
        } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
          # These two entries must be strictly positive.
          if ($col[1] <= 0.0) {
            set_to_fail();
            print "--> ERROR: bad correction term in $dict/silprob.txt \"$_\"\n";
          }
        } else {
          print "--> ERROR: unexpected line in $dict/silprob.txt \"$_\"\n";
          set_to_fail();
        }
      }
      close($silprob_fh);
    }
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}

# At least one of lexicon.txt and lexiconp.txt has to be present.
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
  set_to_fail();
}
# Checks that two lexicon files describe the same entries in the same order.
#   $lex1, $lex2:            paths of the two lexicons.
#   $num_prob_cols1/2,
#   $num_skipped_cols1/2:    for each file, how many columns after the word
#                            are removed before the rest of the line is
#                            compared as the pronunciation.
# Line by line, the word and the remaining columns have to be identical and
# both files must have the same number of lines.  Only the first discrepancy
# is reported.  Problems are printed and recorded with set_to_fail(); the
# return value carries no information.
sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  print "Checking lexicon pair $lex1 and $lex2\n";
  # Start from a clean per-check status, so the verdict printed at the end
  # reflects this comparison and not an earlier, unrelated failure.
  $success = 1;
  my ($fh1, $fh2);
  if (!open($fh1, "<", $lex1)) {
    print "--> ERROR: fail to open $lex1\n";
    set_to_fail();
    return;
  }
  if (!open($fh2, "<", $lex2)) {
    print "--> ERROR: fail to open $lex2\n";
    set_to_fail();
    close($fh1);
    return;
  }
  my $num_dropped1 = $num_prob_cols1 + $num_skipped_cols1;
  my $num_dropped2 = $num_prob_cols2 + $num_skipped_cols2;
  my $matched = 1;   # becomes 0 at the first discrepancy
  my $line_num = 0;
  while (my $line1 = <$fh1>) {
    $line_num++;
    my @A = split(" ", $line1);
    my $line2 = <$fh2>;
    if (!defined $line2) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); $matched = 0; last;
    }
    my @B = split(" ", $line2);
    # Check if the word matches.
    my $word1 = shift @A;
    my $word2 = shift @B;
    $word1 = "" unless defined $word1;
    $word2 = "" unless defined $word2;
    if ($word1 ne $word2) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $matched = 0; last;
    }
    # Drop the probability and skipped columns of each file.
    splice(@A, 0, $num_dropped1);
    splice(@B, 0, $num_dropped2);
    # Check if the pronunciation matches
    if (join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); $matched = 0; last;
    }
  }
  # A line left over in $lex2 is only worth reporting if nothing has been
  # reported yet for this pair.
  if ($matched && defined(my $extra_line = <$fh2>)) {
    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
    set_to_fail(); $matched = 0;
  }
  close($fh1);
  close($fh2);
  $matched && print "--> lexicon pair $lex1 and $lex2 match\n\n";
}
# When more than one lexicon is present they have to agree with each other:
# the user may have overwritten one of them, in which case the other needs to
# be regenerated, and we cannot tell which of the two is the current one.
check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0)
  if -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt";
check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
                   "$dict/lexiconp_silprob.txt", 2, 2)
  if -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt";
# Checking extra_questions.txt -------------------------------
# %distinguished keeps track of all phone-pairs including nonsilence that are
# distinguished (split apart) by extra_questions.txt, as
# $distinguished{$p1,$p2} = 1.  This will be used to make sure that we don't
# have pairs of phones on the same line in nonsilence_phones.txt that can
# never be distinguished from each other by questions.  (If any two phones
# appear on the same line in nonsilence_phones.txt, they share a tree root,
# and since the automatic question-building treats all phones that appear on
# the same line of nonsilence_phones.txt as being in the same group, we can
# never distinguish them without resorting to questions in
# extra_questions.txt.)
%distinguished = ();
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  $idx = 1;       # current line number, used in the messages
  $success = 1;
  $crlf = 1;      # stays 1 until the carriage-return error has been printed
  my $extra_questions_fh;
  if (!open($extra_questions_fh, "<", "$dict/extra_questions.txt")) {
    # Nothing can be read, so the file must not be reported as OK below.
    set_to_fail();
    print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  } else {
    print "--> reading $dict/extra_questions.txt\n";
    while (<$extra_questions_fh>) {
      if ($crlf == 1 && m/\r/) {
        print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $crlf = 0;
      }
      # Only the last line of a file can lack the terminating newline.
      if (! s/\n$//) {
        print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
        set_to_fail();
      }
      my @col = split(" ", $_);
      if (@col == 0) {
        set_to_fail();
        print "--> ERROR: empty line in $dict/extra_questions.txt\n";
      }
      foreach my $block (0 .. $#col) {
        my $phone = $col[$block];
        if (!$silence{$phone} and !$nonsilence{$phone}) {
          set_to_fail();
          print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt (line $idx, block ", $block + 1, ")\n";
        }
      }
      # One increment per line (not per phone), so that the line number in
      # the message above is the real line number.
      $idx++;

      # Update %distinguished hash: for each p1 in this question and p2 not
      # in this question (and in nonsilence phones), mark p1,p2 as being
      # split apart.  The phones outside the question are the same for every
      # p1, so they are collected once per line.
      my %in_question = map { $_ => 1 } @col;
      my @outside = grep { !defined $in_question{$_} } keys %nonsilence;
      foreach my $p1 (@col) {
        foreach my $p2 (@outside) {
          $distinguished{$p1,$p2} = 1;
          $distinguished{$p2,$p1} = 1;
        }
      }
    }
    close($extra_questions_fh);
  }
  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
# Check nonsilence_phones.txt again for phone-pairs that are never
# distinguishable.  (note: this situation is normal and expected for silence
# phones, so we don't check it.)
# Two different phones on the same line must have an entry in %distinguished;
# otherwise the run is marked as failed.  The check runs over ordered pairs,
# so an offending pair is seen in both orders.
my $nonsilence_again_fh;
if (!open($nonsilence_again_fh, "<", "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n";
  exit 1;
}

$num_warn_nosplit = 0;         # offending pairs seen so far
$num_warn_nosplit_limit = 10;  # messages are printed while the count is <= this
while (<$nonsilence_again_fh>) {
  my @col = split(" ", $_);
  foreach my $p1 (@col) {
    foreach my $p2 (@col) {
      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
        # Every offending pair counts as a failure, even once the messages
        # are no longer printed.
        set_to_fail();
        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
        }
        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
          print "... Not warning any more times about this issue.\n";
        }
        # The explanatory note accompanies the first message only.
        if ($num_warn_nosplit == 0) {
          print " (note: we started checking for this only recently. You can still build a system but\n";
          print " phones $p1 and $p2 will be acoustically indistinguishable).\n";
        }
        $num_warn_nosplit++;
      }
    }
  }
}
close($nonsilence_again_fh);
# Final verdict: status 0 if no check has failed, status 1 otherwise.
unless ($exit == 1) {
  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
  exit 0;
}

print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n\n";
exit 1;