index 9780de57f2ebf5c282b7d71fef85eb443e9167b4..a5c9ff8da21b519bc3aac69a875f6f88d6095304 100755 (executable)
if(@ARGV != 1) {
- die "Usage: validate_dict_dir.pl dict_directory\n";
+ die "Usage: validate_dict_dir.pl <dict-dir>\n" .
+ "e.g.: validate_dict_dir.pl data/local/dict\n";
}
$dict = shift @ARGV;
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
$idx = 1;
%silence = ();
+$crlf = 1;
print "--> reading $dict/silence_phones.txt\n";
while(<S>) {
print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
set_to_fail();
}
+ if ($crlf == 1 && m/\r/) {
+ print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
+ set_to_fail();
+ $crlf = 0;
+ }
my @col = split(" ", $_);
if (@col == 0) {
set_to_fail();
}
foreach(0 .. @col-1) {
my $p = $col[$_];
- if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; }
- else {$silence{$p} = 1;}
- if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/){
+ if($silence{$p}) {
+ set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
+ } else {
+ $silence{$p} = 1;
+ }
+ # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
+ # problems with word-position-dependent systems, and <eps> is obviously
+ # confusable with epsilon.
+ if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form\n";
-
}
}
$idx ++;
if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
$idx = 1;
$success = 1;
+$crlf = 1;
print "--> reading $dict/optional_silence.txt\n";
while(<OS>) {
chomp;
} elsif (!$silence{$col[0]}) {
set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
}
+ if ($crlf == 1 && m/\r/) {
+ print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
+ set_to_fail();
+ $crlf = 0;
+ }
$idx ++;
}
close(OS);
$idx = 1;
%nonsilence = ();
$success = 1;
+$crlf = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
while(<NS>) {
+ if ($crlf == 1 && m/\r/) {
+ print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
+ set_to_fail();
+ $crlf = 0;
+ }
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
set_to_fail();
}
foreach(0 .. @col-1) {
my $p = $col[$_];
- if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; }
- else {$nonsilence{$p} = 1;}
- if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/){
+ if($nonsilence{$p}) {
+ set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
+ } else {
+ $nonsilence{$p} = 1;
+ }
+ # phones that start with the pound sign/hash may be mistaken for
+ # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
+ # problems with word-position-dependent systems, and <eps> is obviously
+ # confusable with epsilon.
+ if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
set_to_fail();
print "--> ERROR: phone \"$p\" has disallowed written form\n";
-
}
}
$idx ++;
print "Checking $lex\n";
!open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
my %seen_line = {};
- $idx = 1; $success = 1;
+ $idx = 1; $success = 1; $crlf = 1;
print "--> reading $lex\n";
while (<L>) {
+ if ($crlf == 1 && m/\r/) {
+ print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
+ set_to_fail();
+ $crlf = 0;
+ }
if (defined $seen_line{$_}) {
print "--> ERROR: line '$_' of $lex is repeated\n";
set_to_fail();
if (!defined $word) {
print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
}
- if ($word eq "<s>" || $word eq "</s>") {
+ if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
print "--> ERROR: lexicon.txt contains forbidden word $word\n";
set_to_fail();
}
if (-f "$dict/silprob.txt") {
!open(SP, "<$dict/silprob.txt") &&
print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail();
+ $crlf = 1;
while (<SP>) {
+ if ($crlf == 1 && m/\r/) {
+ print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
+ set_to_fail();
+ $crlf = 0;
+ }
chomp; my @col = split;
@col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail();
if ($col[0] eq "<s>" || $col[0] eq "overall") {
}
$idx = 1;
$success = 1;
+ $crlf = 1;
print "--> reading $dict/extra_questions.txt\n";
while(<EX>) {
+ if ($crlf == 1 && m/\r/) {
+ print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
+ set_to_fail();
+ $crlf = 0;
+ }
if (! s/\n$//) {
print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
set_to_fail();