1 #!/usr/bin/env perl
3 # Apache 2.0.
4 # Guoguo Chen (guoguo@jhu.edu)
5 # Daniel Povey (dpovey@gmail.com)
6 #
7 # Validation script for data/local/dict
# Exactly one command-line argument is required: the dictionary directory.
die "Usage: validate_dict_dir.pl dict_directory\n" unless @ARGV == 1;

# Take the directory from the command line and drop one trailing slash, so
# that the paths built from it below do not contain "//".
$dict = shift @ARGV;
$dict =~ s{/$}{};

# Global status flags; both are updated by set_to_fail().
$success = 1;  # per-file flag: this is re-set each time we read a file.
$exit = 0;     # overall flag: the script exits with status 1 if this is 1.
# Record a validation failure: sets the overall flag $exit to 1 and clears the
# per-file flag $success.  Takes no arguments.  The sub returns the value of
# its last assignment, i.e. 0.
sub set_to_fail {
  $exit = 1;
  $success = 0;
}
# Checking silence_phones.txt -------------------------------
# Every whitespace-separated token on every line is recorded as a silence
# phone in %silence.  Problems are reported on stdout and flagged through
# set_to_fail(); an empty or unopenable file ends the script with status 1.
my $sil_path = "$dict/silence_phones.txt";
print "Checking $sil_path ...\n";
if (-z $sil_path) {
  print "--> ERROR: $sil_path is empty or not exists\n";
  exit 1;
}
unless (open(S, "<$sil_path")) {
  print "--> ERROR: fail to open $sil_path\n";
  exit 1;
}

%silence = ();
$crlf = 1;  # stays 1 until the first carriage return has been reported.
$idx = 1;   # 1-based line counter used in the messages.

print "--> reading $sil_path\n";
while (defined(my $line = <S>)) {
  # Only a final line can lack the terminating newline.
  unless ($line =~ s/\n$//) {
    print "--> ERROR: last line '$line' of $sil_path does not end in newline.\n";
    set_to_fail();
  }
  # Report carriage returns once per file, not once per line.
  if ($crlf == 1 && $line =~ m/\r/) {
    print "--> ERROR: $sil_path contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  my @phones = split(" ", $line);
  unless (@phones) {
    set_to_fail();
    print "--> ERROR: empty line in $sil_path (line $idx)\n";
  }
  for my $p (@phones) {
    if ($silence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $sil_path (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    # Phones containing "#<digit>" or ending in _B/_E/_S/_I are rejected.
    if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close(S);
print "--> $sil_path is OK\n" unless $success == 0;
print "\n";
# Checking optional_silence.txt -------------------------------
# The file is expected to hold exactly one phone, and that phone must be a
# key of %silence (i.e. it was listed in silence_phones.txt).
my $optsil_path = "$dict/optional_silence.txt";
print "Checking $optsil_path ...\n";
if (-z $optsil_path) {
  print "--> ERROR: $optsil_path is empty or not exists\n";
  exit 1;
}
unless (open(OS, "<$optsil_path")) {
  print "--> ERROR: fail to open $optsil_path\n";
  exit 1;
}

# Line counter, per-file success flag, and "carriage return not yet
# reported" flag.
($idx, $success, $crlf) = (1, 1, 1);

print "--> reading $optsil_path\n";
while (defined(my $line = <OS>)) {
  chomp $line;
  my @fields = split(" ", $line);
  my $phone = $fields[0];
  if ($idx > 1 or @fields > 1) {
    # More than one line, or more than one token on a line.
    set_to_fail();
    print "--> ERROR: only 1 phone expected in $optsil_path\n";
  } elsif (!$silence{$phone}) {
    set_to_fail();
    print "--> ERROR: phone $phone not found in $dict/silence_phones.txt\n";
  }
  # Report carriage returns once per file.
  if ($crlf == 1 && $line =~ m/\r/) {
    print "--> ERROR: $optsil_path contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  $idx++;
}
close(OS);
print "--> $optsil_path is OK\n" unless $success == 0;
print "\n";
# Checking nonsilence_phones.txt -------------------------------
# Every whitespace-separated token on every line is recorded as a
# non-silence phone in %nonsilence.  Problems are reported on stdout and
# flagged through set_to_fail(); an empty or unopenable file ends the script
# with status 1.
my $nonsil_path = "$dict/nonsilence_phones.txt";
print "Checking $nonsil_path ...\n";
if (-z $nonsil_path) {
  print "--> ERROR: $nonsil_path is empty or not exists\n";
  exit 1;
}
unless (open(NS, "<$nonsil_path")) {
  print "--> ERROR: fail to open $nonsil_path\n";
  exit 1;
}

%nonsilence = ();
# Line counter, per-file success flag, and "carriage return not yet
# reported" flag.
($idx, $success, $crlf) = (1, 1, 1);

print "--> reading $nonsil_path\n";
while (defined(my $line = <NS>)) {
  # Report carriage returns once per file, not once per line.
  if ($crlf == 1 && $line =~ m/\r/) {
    print "--> ERROR: $nonsil_path contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  # Only a final line can lack the terminating newline.
  unless ($line =~ s/\n$//) {
    print "--> ERROR: last line '$line' of $nonsil_path does not end in newline.\n";
    set_to_fail();
  }
  my @phones = split(" ", $line);
  unless (@phones) {
    set_to_fail();
    print "--> ERROR: empty line in $nonsil_path (line $idx)\n";
  }
  for my $p (@phones) {
    if ($nonsilence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $nonsil_path (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    # Phones containing "#<digit>" or ending in _B/_E/_S/_I are rejected.
    if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close(NS);
print "--> $nonsil_path is OK\n" unless $success == 0;
print "\n";
# Checking disjoint -------------------------------
# intersect(\%first, \%second)
#   Returns the list of keys that are present in both hashes.  Only key
#   existence is tested; the values stored under the keys are ignored.
#   The result follows the key order of the first hash, which Perl does not
#   specify, so callers should sort it if they need a stable order.
#   In scalar context the number of common keys is returned.
#   Neither input hash is modified, and no package variables are touched.
sub intersect {
  my ($first, $second) = @_;
  # Hash keys are unique, so no extra de-duplication is needed.
  return grep { exists $second->{$_} } keys %$first;
}
# The silence and non-silence phone sets must not share any phone.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset) {
  set_to_fail();
  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
  print "$_ " foreach @itset;
  print "\n";
} else {
  print "--> disjoint property is OK.\n";
}
print "\n";
# check_lexicon($lex, $num_prob_cols, $num_skipped_cols)
#   Validates one lexicon file.  Each line is read as
#     <word> <prob> x $num_prob_cols <ignored> x $num_skipped_cols <phone> ...
#   and the following is checked:
#     - no carriage returns, no repeated lines, final line ends in newline;
#     - no empty lines, and the word is not <s>, </s> or <eps>;
#     - every probability column is in the range (0, 1];
#     - the pronunciation is non-empty and every phone is a key of %silence
#       or %nonsilence.
#   Problems are printed to stdout and flagged through set_to_fail().  The
#   per-file flag $success is reset on entry.  If the file cannot be opened
#   the error is reported and the sub returns without reading anything.
#   Returns nothing useful.
sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  $success = 1;

  my $lex_fh;
  if (!open($lex_fh, '<', $lex)) {
    # Statements are kept separate on purpose: "print STRING && f()" would
    # print the return value of f() instead of STRING.
    print "--> ERROR: fail to open $lex\n";
    set_to_fail();
    print "\n";
    return;
  }

  my %seen_line = ();
  my $line_num = 1;
  my $cr_reported = 0;  # carriage returns are reported once per file.
  print "--> reading $lex\n";
  while (my $line = <$lex_fh>) {
    if (!$cr_reported && $line =~ m/\r/) {
      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $cr_reported = 1;
    }
    # Only a final line can lack the terminating newline.
    if (!($line =~ s/\n$//)) {
      print "--> ERROR: last line '$line' of $lex does not end in newline.\n";
      set_to_fail();
    }
    # Duplicates are detected on the newline-stripped text so that an
    # unterminated last line is still compared correctly.
    if ($seen_line{$line}) {
      print "--> ERROR: line '$line' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$line} = 1;

    my @col = split(" ", $line);
    my $word = shift @col;
    if (!defined $word) {
      print "--> ERROR: empty lexicon line in $lex\n";
      set_to_fail();
      # Nothing else can be checked on an empty line.
      $line_num++;
      next;
    }
    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>") {
      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
      set_to_fail();
    }
    for (1 .. $num_prob_cols) {
      my $prob = shift @col;
      if (!(defined $prob && $prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$line', in $lex\n";
        set_to_fail();
      }
    }
    # Drop the columns that are not validated here.
    splice(@col, 0, $num_skipped_cols) if $num_skipped_cols > 0;
    if (@col == 0) {
      print "--> ERROR: lexicon.txt contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    for my $phone (@col) {
      if (!$silence{$phone} and !$nonsilence{$phone}) {
        print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt ";
        print "(line $line_num)\n";
        set_to_fail();
      }
    }
    $line_num++;
  }
  close($lex_fh);
  $success == 0 || print "--> $lex is OK\n";
  print "\n";
  return;
}
# Validate whichever lexicon files are present.  The numeric arguments are
# the number of probability columns and the number of further columns to
# skip before the pronunciation starts.
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    my $sp_fh;
    if (open($sp_fh, '<', "$dict/silprob.txt")) {
      my $cr_reported = 0;  # carriage returns are reported once per file.
      while (my $line = <$sp_fh>) {
        if (!$cr_reported && $line =~ m/\r/) {
          print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
          set_to_fail();
          $cr_reported = 1;
        }
        chomp $line;
        my @col = split(" ", $line);
        # Every line must be exactly "<label> <value>".
        if (@col != 2) {
          print "--> ERROR: bad line \"$line\"\n";
          set_to_fail();
          next;
        }
        if ($col[0] eq "<s>" || $col[0] eq "overall") {
          # These two entries are probabilities and must lie in (0, 1].
          if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
            set_to_fail();
            print "--> ERROR: bad probability in $dict/silprob.txt \"$line\"\n";
          }
        } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
          # These two entries are correction terms and must be positive.
          if ($col[1] <= 0.0) {
            set_to_fail();
            print "--> ERROR: bad correction term in $dict/silprob.txt \"$line\"\n";
          }
        } else {
          print "--> ERROR: unexpected line in $dict/silprob.txt \"$line\"\n";
          set_to_fail();
        }
      }
      close($sp_fh);
    } else {
      # Statements are kept separate on purpose: "print STRING && f()" would
      # print the return value of f() instead of STRING.
      print "--> ERROR: fail to open $dict/silprob.txt\n";
      set_to_fail();
    }
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}

# At least one of lexicon.txt / lexiconp.txt must be present.
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
  set_to_fail();
}
# check_lexicon_pair($lex1, $num_prob_cols1, $num_skipped_cols1,
#                    $lex2, $num_prob_cols2, $num_skipped_cols2)
#   Verifies that two lexicon files list the same entries in the same order.
#   Line by line, the word (first field) and the pronunciation (what is left
#   after dropping the word plus the given number of probability and skipped
#   columns) must be equal, and both files must have the same number of
#   lines.  Checking stops at the first discrepancy, which is printed to
#   stdout and flagged through set_to_fail().
#   The per-file flag $success is reset on entry, so the final "match" line
#   reflects this comparison only and not the outcome of earlier checks.
#   Returns nothing useful.
sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  print "Checking lexicon pair $lex1 and $lex2\n";
  $success = 1;

  my ($fh1, $fh2);
  if (!open($fh1, '<', $lex1)) {
    print "--> ERROR: fail to open $lex1\n";
    set_to_fail();
    return;
  }
  if (!open($fh2, '<', $lex2)) {
    print "--> ERROR: fail to open $lex2\n";
    set_to_fail();
    close($fh1);
    return;
  }

  # Number of columns between the word and the pronunciation in each file.
  my $num_dropped1 = $num_prob_cols1 + $num_skipped_cols1;
  my $num_dropped2 = $num_prob_cols2 + $num_skipped_cols2;

  my $line_num = 0;
  my $stopped_early = 0;  # set when the loop ends on a reported discrepancy.
  while (my $line1 = <$fh1>) {
    $line_num++;
    my $line2 = <$fh2>;
    if (!defined $line2) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail();
      $stopped_early = 1;
      last;
    }
    my @A = split(" ", $line1);
    my @B = split(" ", $line2);
    # A blank line has no word; compare it as the empty string.
    my $word1 = shift @A;
    my $word2 = shift @B;
    $word1 = "" if !defined $word1;
    $word2 = "" if !defined $word2;
    splice(@A, 0, $num_dropped1);
    splice(@B, 0, $num_dropped2);
    # Check that both the word and the pronunciation match.
    if ($word1 ne $word2 || join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail();
      $stopped_early = 1;
      last;
    }
  }
  # If the first file ran out cleanly, the second one must be exhausted too.
  # This is decided from the local flag, not from the script-wide $exit, so
  # it is still reported when unrelated earlier checks have failed.
  if (!$stopped_early && !eof($fh2)) {
    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
    set_to_fail();
  }
  close($fh1);
  close($fh2);
  $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n";
  return;
}
# When more than one lexicon file is present they must agree with each
# other.  The user may have overwritten one of them so that the other needs
# regenerating, but we cannot tell which one is stale.
my $lexicon_txt = "$dict/lexicon.txt";
my $lexiconp_txt = "$dict/lexiconp.txt";
my $lexiconp_silprob_txt = "$dict/lexiconp_silprob.txt";

if (-f $lexicon_txt and -f $lexiconp_txt) {
  check_lexicon_pair($lexicon_txt, 0, 0, $lexiconp_txt, 1, 0);
}
if (-f $lexiconp_txt and -f $lexiconp_silprob_txt) {
  check_lexicon_pair($lexiconp_txt, 1, 0, $lexiconp_silprob_txt, 2, 2);
}
# Checking extra_questions.txt -------------------------------
# %distinguished keeps track of all phone pairs (including nonsilence phones)
# that are distinguished (split apart) by extra_questions.txt, stored as
# $distinguished{$p1,$p2} = 1.  It is used later to make sure that we do not
# have pairs of phones on the same line of nonsilence_phones.txt that can
# never be distinguished from each other by questions.  (If any two phones
# appear on the same line of nonsilence_phones.txt they share a tree root,
# and since the automatic question-building treats all phones on the same
# line as being in the same group, we can never distinguish them without
# resorting to questions in extra_questions.txt.)
%distinguished = ();
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  $success = 1;
  my $ex_fh;
  if (open($ex_fh, '<', "$dict/extra_questions.txt")) {
    my $line_num = 1;
    my $cr_reported = 0;  # carriage returns are reported once per file.
    print "--> reading $dict/extra_questions.txt\n";
    while (my $line = <$ex_fh>) {
      if (!$cr_reported && $line =~ m/\r/) {
        print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $cr_reported = 1;
      }
      # Only a final line can lack the terminating newline.
      if (!($line =~ s/\n$//)) {
        print "--> ERROR: last line '$line' of $dict/extra_questions.txt does not end in newline.\n";
        set_to_fail();
      }
      my @col = split(" ", $line);
      if (@col == 0) {
        set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
      }
      # Every phone in a question must be a known phone.
      for my $block (0 .. $#col) {
        my $phone = $col[$block];
        if (!$silence{$phone} and !$nonsilence{$phone}) {
          set_to_fail();
          print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt (line $line_num, block ", $block + 1, ")\n";
        }
      }
      # Update %distinguished: each phone p1 in this question is split
      # apart from each nonsilence phone p2 that is not in this question.
      # The list of such p2 is the same for every p1, so compute it once.
      my %in_question = map { $_ => 1 } @col;
      my @outside = grep { !exists $in_question{$_} } keys %nonsilence;
      for my $p1 (@col) {
        for my $p2 (@outside) {
          $distinguished{$p1,$p2} = 1;
          $distinguished{$p2,$p1} = 1;
        }
      }
      # Advance once per line, so that messages report real line numbers.
      $line_num++;
    }
    close($ex_fh);
  } else {
    set_to_fail();
    print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  }
  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
# Check nonsilence_phones.txt again for phone pairs that are never
# distinguishable, i.e. two different phones on the same line for which
# %distinguished has no entry.  (Note: this situation is normal and expected
# for silence phones, so we don't check it.)
# %distinguished is always filled in symmetrically, so each unordered pair is
# examined and reported once.
my $nosplit_fh;
if (!open($nosplit_fh, '<', "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
}

$num_warn_nosplit = 0;         # number of indistinguishable pairs seen so far.
$num_warn_nosplit_limit = 10;  # pairs counted 0 .. limit are printed.
while (my $line = <$nosplit_fh>) {
  my @col = split(" ", $line);
  for my $i (0 .. $#col) {
    for my $j ($i + 1 .. $#col) {
      my ($p1, $p2) = @col[$i, $j];
      next if $p1 eq $p2 || $distinguished{$p1,$p2};
      # Every such pair fails validation, even once printing has stopped.
      set_to_fail();
      if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
        print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
      }
      if ($num_warn_nosplit == $num_warn_nosplit_limit) {
        print "... Not warning any more times about this issue.\n";
      }
      # The explanatory note is printed with the first reported pair only.
      if ($num_warn_nosplit == 0) {
        print " (note: we started checking for this only recently. You can still build a system but\n";
        print " phones $p1 and $p2 will be acoustically indistinguishable).\n";
      }
      $num_warn_nosplit++;
    }
  }
}
close($nosplit_fh);
# Final verdict: exit status 0 when no check called set_to_fail(), 1 otherwise.
unless ($exit == 1) {
  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
  exit 0;
}

print "--> ERROR validating dictionary directory $dict (see detailed error ";
print "messages above)\n\n";
exit 1;