1 #!/usr/bin/perl
3 # Apache 2.0.
4 # Guoguo Chen (guoguo@jhu.edu)
5 # Daniel Povey (dpovey@gmail.com)
6 #
7 # Validation script for data/local/dict
# Exactly one argument is accepted: the dictionary directory to validate.
unless (@ARGV == 1) {
    die "Usage: validate_dict_dir.pl dict_directory\n";
}

$dict = shift @ARGV;   # directory whose files are checked below
$exit = 0;             # set to 1 by any check that reports an error
# Checking silence_phones.txt -------------------------------
# Reads the whitespace-separated phones on every line into %silence and
# reports (a) phones that occur more than once and (b) phones whose written
# form is disallowed: a trailing "_", any "#", or a "_B"/"_E"/"_S"/"_I" suffix.
print "Checking $dict/silence_phones.txt ...\n";
if (-z "$dict/silence_phones.txt") {
    print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n";
    exit 1;
}
# Three-argument open with a lexical handle: the path comes from the command
# line, so it must never be able to influence the open mode.
my $silence_fh;
if (!open($silence_fh, '<', "$dict/silence_phones.txt")) {
    print "--> ERROR: fail to open $dict/silence_phones.txt\n";
    exit 1;
}
$idx = 1;
%silence = ();
$success = 1;
print "--> reading $dict/silence_phones.txt\n";
while (my $line = <$silence_fh>) {
    # Every line, including the last one, has to be newline-terminated.
    unless ($line =~ s/\n$//) {
        die "Last line '$line' of $dict/silence_phones.txt does not end in newline.\n";
    }
    for my $p (split(" ", $line)) {
        if ($silence{$p}) {
            $exit = 1;
            print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
            $success = 0;
        }
        else {
            $silence{$p} = 1;
        }
        if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/) {
            $exit = 1;
            # Terminate the message so it does not run into the next line of output.
            print "--> ERROR: phone \"$p\" has disallowed written form\n";
            $success = 0;
        }
    }
    $idx++;
}
close($silence_fh);
print "--> $dict/silence_phones.txt is OK\n" unless $success == 0;
print "\n";
# Checking optional_silence.txt -------------------------------
# The file is expected to hold a single phone on a single line, and that
# phone must be a key of %silence.
print "Checking $dict/optional_silence.txt ...\n";
if (-z "$dict/optional_silence.txt") {
    print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n";
    exit 1;
}
unless (open(OS, "<$dict/optional_silence.txt")) {
    print "--> ERROR: fail to open $dict/optional_silence.txt\n";
    exit 1;
}
$idx = 1;
$success = 1;
print "--> reading $dict/optional_silence.txt\n";
while (defined(my $line = <OS>)) {
    chomp $line;
    my @phones = split(" ", $line);
    if ($idx > 1 || @phones > 1) {
        # Either a second line or a second phone on this line.
        $exit = 1;
        print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
        $success = 0;
    }
    elsif (!$silence{$phones[0]}) {
        $exit = 1;
        print "--> ERROR: phone $phones[0] not found in $dict/silence_phones.txt\n";
        $success = 0;
    }
    $idx++;
}
close(OS);
print "--> $dict/optional_silence.txt is OK\n" unless $success == 0;
print "\n";
# Checking nonsilence_phones.txt -------------------------------
# Reads the whitespace-separated phones on every line into %nonsilence and
# reports (a) phones that occur more than once and (b) phones whose written
# form is disallowed: a trailing "_", any "#", or a "_B"/"_E"/"_S"/"_I" suffix.
print "Checking $dict/nonsilence_phones.txt ...\n";
if (-z "$dict/nonsilence_phones.txt") {
    print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n";
    exit 1;
}
# Three-argument open with a lexical handle: the path comes from the command
# line, so it must never be able to influence the open mode.
my $nonsilence_fh;
if (!open($nonsilence_fh, '<', "$dict/nonsilence_phones.txt")) {
    print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n";
    exit 1;
}
$idx = 1;
%nonsilence = ();
$success = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while (my $line = <$nonsilence_fh>) {
    # Every line, including the last one, has to be newline-terminated.
    unless ($line =~ s/\n$//) {
        die "Last line '$line' of $dict/nonsilence_phones.txt does not end in newline.\n";
    }
    for my $p (split(" ", $line)) {
        if ($nonsilence{$p}) {
            $exit = 1;
            print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
            $success = 0;
        }
        else {
            $nonsilence{$p} = 1;
        }
        if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/) {
            $exit = 1;
            # Terminate the message so it does not run into the next line of output.
            print "--> ERROR: phone \"$p\" has disallowed written form\n";
            $success = 0;
        }
    }
    $idx++;
}
close($nonsilence_fh);
# Name the file that was actually checked in this section.
print "--> $dict/nonsilence_phones.txt is OK\n" unless $success == 0;
print "\n";
# Checking disjoint -------------------------------
# intersect(\%first, \%second)
#   Returns the keys of %first that also exist as keys of %second.  Only key
#   existence is tested; the stored values are ignored.  The result is sorted
#   so that anything printed from it is reproducible between runs.  In scalar
#   context the number of common keys is returned.
#   All temporaries are lexical, so a call never modifies the package-level
#   @itset or %itset.
sub intersect {
    my ($first, $second) = @_;
    # Build the list first and return the array: sort() itself must not be
    # evaluated in scalar context.
    my @common = sort grep { exists $second->{$_} } keys %$first;
    return @common;
}
# Report every phone that appears both in %silence and in %nonsilence.
print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if (@itset) {
    $exit = 1;
    print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: ";
    print "$_ " foreach @itset;
    print "\n";
}
else {
    print "--> disjoint property is OK.\n";
}
print "\n";
# check_lexicon($lexfn, $pron_probs)
#   Validates one lexicon file.  Each line is split on whitespace into a word,
#   an optional pronunciation probability (read only when $pron_probs is
#   true; it must satisfy 0 < prob <= 1) and a list of phones, each of which
#   must be a key of %silence or %nonsilence.
#   Problems are printed to stdout and recorded by setting the package
#   variable $exit to 1.  "is OK" is printed only when the file was non-empty,
#   could be opened and produced no error.  Nothing is returned.
#   Line counter and status flag are lexical, so the package-level $idx and
#   $success are left alone.
sub check_lexicon {
    my ($lexfn, $pron_probs) = @_;
    my $ok = 1;

    print "Checking $lexfn\n";
    if (-z $lexfn) {
        $exit = 1;
        print "--> ERROR: $lexfn is empty or not exists\n";
        $ok = 0;
    }

    my $lex_fh;
    if (!open($lex_fh, '<', $lexfn)) {
        # Nothing can be read, so do not fall through to the "is OK" line.
        $exit = 1;
        print "--> ERROR: fail to open $lexfn\n";
        print "\n";
        return;
    }

    print "--> reading $lexfn\n";
    my $line_no = 1;
    while (my $line = <$lex_fh>) {
        chomp $line;
        my @col = split(" ", $line);

        my $word = shift @col;
        if (!defined $word) {
            $exit = 1;
            print "--> ERROR: empty lexicon line in $lexfn\n";
            $ok = 0;
            # An empty line has no further fields worth checking.
            next;
        }

        if ($pron_probs) {
            my $prob = shift @col;
            if (!defined $prob || !($prob > 0.0 && $prob <= 1.0)) {
                $exit = 1;
                print "--> ERROR: bad pron-prob in lexicon-line '$line', in $lexfn\n";
                $ok = 0;
            }
        }

        for my $phone (@col) {
            if (!$silence{$phone} and !$nonsilence{$phone}) {
                $exit = 1;
                print "--> ERROR: phone \"$phone\" is not in {, non}silence.txt (line $line_no)\n";
                $ok = 0;
            }
        }
    }
    continue {
        # Runs after every line, including those skipped with next.
        $line_no++;
    }
    close($lex_fh);

    print "--> $lexfn is OK\n" if $ok;
    print "\n";
    return;
}
# Validate whichever of lexicon.txt / lexiconp.txt is present (the latter is
# checked with pronunciation probabilities enabled); at least one must exist.
if (-f "$dict/lexicon.txt")  { check_lexicon("$dict/lexicon.txt", 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1); }
if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
    # The directory under validation is held in $dict.
    print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dict\n";
    $exit = 1;
}
# If both lexicon.txt and lexiconp.txt exist, we check that they correspond to
# each other.  If not, it could be that the user overwrote one and we need to
# regenerate the other, but we don't know which is which.
# The files are compared line by line after removing the second field (the
# pronunciation probability) from each lexiconp.txt line.  Only the first
# discrepancy is reported, and the comparison stops there.
if ((-f "$dict/lexicon.txt") && (-f "$dict/lexiconp.txt")) {
    print "Checking that lexicon.txt and lexiconp.txt match\n";
    my ($lex_fh, $lexp_fh);
    if (!open($lex_fh, '<', "$dict/lexicon.txt")
        || !open($lexp_fh, '<', "$dict/lexiconp.txt")) {
        die "Error opening lexicon.txt and/or lexiconp.txt"; # already checked, so would be code error.
    }

    # Tracks a discrepancy found by THIS check, independently of whatever
    # earlier checks stored in $exit.
    my $mismatch = 0;

    LINE:
    while (my $lex_line = <$lex_fh>) {
        my $lexp_line = <$lexp_fh>;
        if (!defined $lexp_line) {
            print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
            $exit = 1;
            $mismatch = 1;
            last LINE;
        }
        # Drop the line terminators so the lines can be quoted in a message.
        chomp $lex_line;
        chomp $lexp_line;

        my @lex_fields  = split(" ", $lex_line);
        my @lexp_fields = split(" ", $lexp_line);
        # Remove the probability column; the remaining fields should be
        # identical to those of the lexicon.txt line.
        splice(@lexp_fields, 1, 1) if @lexp_fields > 1;

        # Fields contain no whitespace, so joining with a space compares both
        # the number of fields and their contents.
        if (join(" ", @lex_fields) ne join(" ", @lexp_fields)) {
            print "--> ERROR: lexicon.txt and lexiconp.txt have mismatched lines '$lex_line' versus '$lexp_line'; delete one.\n";
            $exit = 1;
            $mismatch = 1;
            last LINE;
        }
    }

    # lexicon.txt is exhausted; lexiconp.txt must be exhausted as well.
    if (!$mismatch && defined(my $extra_line = <$lexp_fh>)) {
        print "--> ERROR: lexicon.txt and lexiconp.txt have different numbers of lines (mismatch); delete one.\n";
        $exit = 1;
    }

    close($lex_fh);
    close($lexp_fh);
}
# Checking extra_questions.txt -------------------------------
# The file is optional: a missing or empty file is accepted.  Otherwise every
# whitespace-separated phone on every line must be a key of %silence or
# %nonsilence.
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
    if (open(my $ex_fh, '<', "$dict/extra_questions.txt")) {
        $idx = 1;
        $success = 1;
        print "--> reading $dict/extra_questions.txt\n";
        while (my $line = <$ex_fh>) {
            chomp $line;
            my @col = split(" ", $line);
            for my $i (0 .. $#col) {
                if (!$silence{$col[$i]} and !$nonsilence{$col[$i]}) {
                    $exit = 1;
                    # "block" is the 1-based position of the phone on its line.
                    print "--> ERROR: phone \"$col[$i]\" is not in {, non}silence.txt (line $idx, block ", $i + 1, ")\n";
                    $success = 0;
                }
            }
            $idx++;
        }
        close($ex_fh);
        print "--> $dict/extra_questions.txt is OK\n" unless $success == 0;
    }
    else {
        # The file could not be read, so it must not be reported as OK.
        $exit = 1;
        print "--> ERROR: fail to open $dict/extra_questions.txt\n";
    }
}
else {
    print "--> $dict/extra_questions.txt is empty (this is OK)\n";
}
# Exit status 0 when no check set $exit to 1; otherwise print a summary line
# and exit with status 1.
unless ($exit == 1) {
    exit 0;
}
print " [Error detected ]\n";
exit 1;