1 #!/usr/bin/perl
3 # Apache 2.0.
4 # Guoguo Chen (guoguo@jhu.edu)
5 # Daniel Povey (dpovey@gmail.com)
6 #
7 # Validation script for data/local/dict
# Exactly one argument is expected: the dictionary directory to validate.
die "Usage: validate_dict_dir.pl dict_directory\n" unless @ARGV == 1;

# Directory being validated, with one trailing slash (if any) removed.
($dict = shift @ARGV) =~ s:/$::;

# $exit is the status of the whole run and becomes 1 once any check fails.
$exit = 0;
# $success is the status of the file currently being checked; it is re-set
# each time we read a file.
$success = 1;
# Record a validation failure: marks the whole run as failed ($exit) and the
# file currently being checked as not OK ($success).
sub set_to_fail {
  $exit = 1;
  $success = 0;
}
# Checking silence_phones.txt -------------------------------
# Reads the silence phones into %silence (phone => 1).  Each line must be
# non-empty and newline-terminated, a phone may be listed only once, and a
# phone must not end in "_", contain "#", or end in "_B", "_E", "_S" or "_I".
# An empty or unreadable file aborts the script with status 1.
print "Checking $dict/silence_phones.txt ...\n";
if (-z "$dict/silence_phones.txt") {
  print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n";
  exit 1;
}
my $sil_fh;
if (!open($sil_fh, '<', "$dict/silence_phones.txt")) {
  print "--> ERROR: fail to open $dict/silence_phones.txt\n";
  exit 1;
}
$idx = 1;       # 1-based line number used in the error messages
%silence = ();

print "--> reading $dict/silence_phones.txt\n";
while (<$sil_fh>) {
  # s/// (rather than chomp) so that a missing final newline is detected.
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
  }
  foreach my $p (@col) {
    if ($silence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/) {
      set_to_fail();
      # The trailing newline keeps this message from running into the next one.
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close($sil_fh);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
print "\n";
# Checking optional_silence.txt -------------------------------
# The file is expected to hold a single phone on a single line, and that
# phone has to be one of those read from silence_phones.txt (%silence).
# An empty or unreadable file aborts the script with status 1.
print "Checking $dict/optional_silence.txt ...\n";
if (-z "$dict/optional_silence.txt") {
  print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n";
  exit 1;
}
unless (open(OS, "<$dict/optional_silence.txt")) {
  print "--> ERROR: fail to open $dict/optional_silence.txt\n";
  exit 1;
}
$idx = 1;
$success = 1;
print "--> reading $dict/optional_silence.txt\n";
while (<OS>) {
  chomp;
  my @fields = split(" ", $_);
  # Anything past the first line, or more than one field, is too much.
  my $too_many = ($idx > 1 or @fields > 1);
  if ($too_many) {
    set_to_fail();
    print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
  } elsif (!$silence{$fields[0]}) {
    set_to_fail();
    print "--> ERROR: phone $fields[0] not found in $dict/silence_phones.txt\n";
  }
  $idx++;
}
close(OS);
print "--> $dict/optional_silence.txt is OK\n" unless $success == 0;
print "\n";
# Checking nonsilence_phones.txt -------------------------------
# Reads the non-silence phones into %nonsilence (phone => 1).  Each line must
# be non-empty and newline-terminated, a phone may be listed only once, and a
# phone must not end in "_", contain "#", or end in "_B", "_E", "_S" or "_I".
# An empty or unreadable file aborts the script with status 1.
print "Checking $dict/nonsilence_phones.txt ...\n";
if (-z "$dict/nonsilence_phones.txt") {
  print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n";
  exit 1;
}
my $nonsil_fh;
if (!open($nonsil_fh, '<', "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n";
  exit 1;
}
$idx = 1;       # 1-based line number used in the error messages
%nonsilence = ();
$success = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while (<$nonsil_fh>) {
  # s/// (rather than chomp) so that a missing final newline is detected.
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @row = split(" ", $_);
  if (@row == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
  }
  foreach my $p (@row) {
    if ($nonsilence{$p}) {
      set_to_fail();
      print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/) {
      set_to_fail();
      # The trailing newline keeps this message from running into the next one.
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx++;
}
close($nonsil_fh);
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";
# Checking disjoint -------------------------------
# intersect(\%first, \%second)
#   Returns the list of keys that exist in both hashes, in the order in which
#   keys() yields them for the first hash.  Only key existence is looked at;
#   the values are ignored.  Neither hash is modified.
sub intersect {
  # Lexical names other than $a/$b, which are reserved for sort blocks.
  my ($first, $second) = @_;
  # Hash keys are already unique, so no separate "seen" bookkeeping is needed.
  return grep { exists $second->{$_} } keys %$first;
}
# A phone must not be listed as both a silence and a non-silence phone.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset) {
  set_to_fail();
  print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
  print "$_ " foreach @itset;
  print "\n";
} else {
  print "--> disjoint property is OK.\n";
}
print "\n";
# check_lexicon($lexfn, $pron_probs)
#   $lexfn      - path of the lexicon file to validate.
#   $pron_probs - true if each entry has a pronunciation probability between
#                 the word and its phones; that value must be > 0 and <= 1.
# Checks that every line is newline-terminated and non-empty, and that every
# phone is present in %silence or %nonsilence.  Problems are printed to stdout
# and recorded with set_to_fail(); the return value is not meaningful.
sub check_lexicon {
  my ($lexfn, $pron_probs) = @_;
  print "Checking $lexfn\n";
  # Re-set the per-file status before any check runs, so that a failure found
  # while testing or opening the file is not wiped out afterwards.
  $success = 1;
  if (-z "$lexfn") {
    set_to_fail();
    print "--> ERROR: $lexfn is empty or not exists\n";
  }
  my $lex_fh;
  if (!open($lex_fh, '<', $lexfn)) {
    set_to_fail();
    print "--> ERROR: fail to open $lexfn\n";
    print "\n";
    return;   # nothing to read
  }
  my $idx = 1;  # 1-based line number used in the error messages
  print "--> reading $lexfn\n";
  while (<$lex_fh>) {
    # s/// (rather than chomp) so that a missing final newline is detected.
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $lexfn does not end in newline.\n";
      set_to_fail();
    }
    my @row = split(" ", $_);
    my $word = shift @row;
    if (!defined $word) {
      set_to_fail(); print "--> ERROR: empty lexicon line in $lexfn\n";
    }
    if ($pron_probs) {
      my $prob = shift @row;
      # A missing probability is rejected explicitly rather than by relying
      # on undef comparing as 0.
      if (!(defined $prob && $prob > 0.0 && $prob <= 1.0)) {
        set_to_fail(); print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lexfn\n";
      }
    }
    foreach my $phone (@row) {
      if (!$silence{$phone} and !$nonsilence{$phone}) {
        set_to_fail(); print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt (line $idx)\n";
      }
    }
    $idx++;
  }
  close($lex_fh);
  $success == 0 || print "--> $lexfn is OK\n";
  print "\n";
}
# Validate whichever of lexicon.txt and lexiconp.txt are present (the second
# argument tells check_lexicon whether entries carry a pronunciation
# probability).  At least one of the two files must exist.
if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); }
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  # The directory name is held in $dict.
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
  set_to_fail();
}
# If both lexicon.txt and lexiconp.txt exist, we check that they correspond to
# each other.  If not, it could be that the user overwrote one and we need to
# regenerate the other, but we don't know which is which.
# The files are read in lock-step; a lexiconp.txt line with its second field
# (the pron-prob) removed must equal the corresponding lexicon.txt line.
# Checking stops at the first problem found.
if ( (-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
  print "Checking that lexicon.txt and lexiconp.txt match\n";
  my ($lex_fh, $lexp_fh);
  if (!open($lex_fh, '<', "$dict/lexicon.txt") || !open($lexp_fh, '<', "$dict/lexiconp.txt")) {
    die "Error opening lexicon.txt and/or lexiconp.txt"; # already checked, so would be code error.
  }
  # Set once this check has reported a problem; used instead of the global
  # $exit so that failures from earlier checks don't hide a length mismatch.
  my $mismatch = 0;
  LINE: while (<$lex_fh>) {
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $dict/lexicon.txt does not end in newline.\n";
      set_to_fail();
      $mismatch = 1;
      last LINE;
    }
    my @A = split;
    my $x = <$lexp_fh>;
    # Test for end-of-file before touching $x: an exhausted lexiconp.txt means
    # the line counts differ, not that a newline is missing.
    if (!defined $x) {
      print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
      set_to_fail();
      $mismatch = 1;
      last LINE;
    }
    if ($x !~ s/\n$//) {
      print "--> ERROR: last line '$x' of $dict/lexiconp.txt does not end in newline.\n";
      set_to_fail();
      $mismatch = 1;
      last LINE;
    }
    my @B = split(" ", $x);
    my $w = shift @B;
    shift @B;            # drop the pron-prob
    unshift @B, $w;
    # now @A and @B should be the same.
    if ($#A != $#B) {
      print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
      set_to_fail();
      $mismatch = 1;
      last LINE;
    }
    for (my $n = 0; $n < @A; $n++) {
      if ($A[$n] ne $B[$n]) {
        print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$_' versus '$x'; delete one.\n";
        set_to_fail();
        $mismatch = 1;
        last LINE;       # leave the file loop, not just this comparison loop
      }
    }
  }
  # Lines left over in lexiconp.txt mean it is longer than lexicon.txt; only
  # meaningful if the loop above ran to the end of lexicon.txt.
  my $extra = <$lexp_fh>;
  if (defined $extra && !$mismatch) {
    print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
    set_to_fail();
  }
  close($lex_fh);
  close($lexp_fh);
}
# Checking extra_questions.txt -------------------------------
# %distinguished keeps track of all phone-pairs including nonsilence that are
# distinguished (split apart) by extra_questions.txt, as
# $distinguished{$p1,$p2} = 1.  This will be used to make sure that we don't
# have pairs of phones on the same line in nonsilence_phones.txt that can
# never be distinguished from each other by questions.  (If any two phones
# appear on the same line in nonsilence_phones.txt, they share a tree root,
# and since the automatic question-building treats all phones that appear on
# the same line of nonsilence_phones.txt as being in the same group, we can
# never distinguish them without resorting to questions in
# extra_questions.txt.)
%distinguished = ();
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  # Re-set the per-file status before opening, so that an open failure is not
  # wiped out afterwards.
  $success = 1;
  my $ex_fh;
  if (!open($ex_fh, '<', "$dict/extra_questions.txt")) {
    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  } else {
    $idx = 1;   # 1-based line number used in the error messages
    print "--> reading $dict/extra_questions.txt\n";
    while (<$ex_fh>) {
      # s/// (rather than chomp) so that a missing final newline is detected.
      if (! s/\n$//) {
        print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
        set_to_fail();
      }
      my @row = split(" ", $_);
      if (@row == 0) {
        set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n";
      }
      foreach my $n (0 .. $#row) {
        my $phone = $row[$n];
        if (!$silence{$phone} and !$nonsilence{$phone}) {
          set_to_fail(); print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt (line $idx, block ", $n+1, ")\n";
        }
      }
      my %row_hash = map { $_ => 1 } @row;
      foreach my $p1 (@row) {
        # Update %distinguished hash.
        foreach my $p2 (keys %nonsilence) {
          if (!defined $row_hash{$p2}) { # for each p1 in this question and p2 not
                                         # in this question (and in nonsilence
                                         # phones)... mark p1,p2 as being split apart
            $distinguished{$p1,$p2} = 1;
            $distinguished{$p2,$p1} = 1;
          }
        }
      }
      $idx++;   # advance once per line, after all phones on it are handled
    }
    close($ex_fh);
    $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
  }
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}
# check nonsilence_phones.txt again for phone-pairs that are never
# distinguishable.  (note: this situation is normal and expected for silence
# phones, so we don't check it.)
# Two different phones on the same line that have no entry in %distinguished
# count as a failure.  At most $num_warn_nosplit_limit such errors are
# printed, followed by a single notice that further ones are suppressed; the
# run is marked as failed for every occurrence regardless.
my $nonsil_again_fh;
if (!open($nonsil_again_fh, '<', "$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
}

$num_warn_nosplit = 0;
$num_warn_nosplit_limit = 10;
while (<$nonsil_again_fh>) {
  my @row = split(" ", $_);
  foreach my $p1 (@row) {
    foreach my $p2 (@row) {
      next if $p1 eq $p2 || $distinguished{$p1,$p2};
      set_to_fail();
      if ($num_warn_nosplit < $num_warn_nosplit_limit) {
        print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
      } elsif ($num_warn_nosplit == $num_warn_nosplit_limit) {
        print "... Not warning any more times about this issue.\n";
      }
      if ($num_warn_nosplit == 0) {
        print " (note: we started checking for this only recently. You can still build a system but\n";
        print " phones $p1 and $p2 will be acoustically indistinguishable).\n";
      }
      $num_warn_nosplit++;
    }
  }
}
close($nonsil_again_fh);
# Final verdict: exit status 0 if every check passed, 1 otherwise.
if ($exit != 1) {
  print "--> SUCCESS [validating dictionary directory $dict]\n";
  exit 0;
}
print "--> ERROR validating dictionary directory $dict (see detailed error messages above)\n";
exit 1;