aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJan "yenda" Trmal2017-09-27 23:10:42 -0500
committerGitHub2017-09-27 23:10:42 -0500
commit6cab750e87fa8affd51ef96b244bf6d06e37ac76 (patch)
tree79c856a462e7470d1e49d0796474f715c61fa070
parentba00b18c290f4ccaba92aba11e45ac7da2d96396 (diff)
downloadkaldi-6cab750e87fa8affd51ef96b244bf6d06e37ac76.tar.gz
kaldi-6cab750e87fa8affd51ef96b244bf6d06e37ac76.tar.xz
kaldi-6cab750e87fa8affd51ef96b244bf6d06e37ac76.zip
Modify data-validation script and dictionary-validation script to disallow exotic space characters (#1910)
* validate_lang checks for incompatible UTF-8 whitespaces * adding validate_dict_dir as well * include utf-8 whitespaces validation for data/<name>/text files * fix perl syntax error
-rwxr-xr-xegs/wsj/s5/utils/validate_data_dir.sh1
-rwxr-xr-xegs/wsj/s5/utils/validate_dict_dir.pl93
-rwxr-xr-xegs/wsj/s5/utils/validate_lang.pl87
-rwxr-xr-xegs/wsj/s5/utils/validate_text.pl131
4 files changed, 310 insertions, 2 deletions
diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh
index 92ba8f8b6..6b82b4333 100755
--- a/egs/wsj/s5/utils/validate_data_dir.sh
+++ b/egs/wsj/s5/utils/validate_data_dir.sh
@@ -108,6 +108,7 @@ fi
108 108
109num_utts=`cat $tmpdir/utts | wc -l` 109num_utts=`cat $tmpdir/utts | wc -l`
110if [ -f $data/text ]; then 110if [ -f $data/text ]; then
111 utils/validate_text.pl $data/text || exit 1;
111 check_sorted_and_uniq $data/text 112 check_sorted_and_uniq $data/text
112 text_len=`cat $data/text | wc -l` 113 text_len=`cat $data/text | wc -l`
113 illegal_sym_list="<s> </s> #0" 114 illegal_sym_list="<s> </s> #0"
diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl
index a5c9ff8da..25e45da97 100755
--- a/egs/wsj/s5/utils/validate_dict_dir.pl
+++ b/egs/wsj/s5/utils/validate_dict_dir.pl
@@ -1,11 +1,95 @@
1#!/usr/bin/env perl 1#!/usr/bin/env perl
2 2
3# Apache 2.0. 3# Apache 2.0.
4# Guoguo Chen (guoguo@jhu.edu) 4# Copyright 2012 Guoguo Chen
5# Daniel Povey (dpovey@gmail.com) 5# 2015 Daniel Povey
6# 2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
6# 7#
7# Validation script for data/local/dict 8# Validation script for data/local/dict
8 9
10# this function reads the opened file (supplied as a first
11# parameter) into an array of lines. For each
12# line, it tests whether it's a valid utf-8 compatible
13# line. If all lines are valid utf-8, it returns the lines
14# decoded as utf-8, otherwise it assumes the file's encoding
15# is one of those 1-byte encodings, such as ISO-8859-x
16# or Windows CP-X.
17# Please recall we do not really care about
18# the actual encoding, we just need to
19# make sure the length of the (decoded) string
20# is correct (to make the output formatting look right).
sub get_utf8_or_bytestream {
  # Reads every remaining line from an already-opened filehandle and
  # reports whether all of them are valid UTF-8.
  #
  # Parameter: the filehandle (glob reference or lexical handle) to read.
  # Returns:   (1, @decoded_lines) when every line decodes as UTF-8,
  #            (0, @raw_lines) otherwise, where the raw lines are the
  #            undecoded bytes exactly as they were read.
  use Encode qw(decode encode);
  my $file = shift;

  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # A lexical loop variable leaves the caller's $_ untouched.  The
  # while() form already tests the read with defined(), so a final line
  # consisting of "0" without a trailing newline is not dropped.
  while (my $raw_text = <$file>) {
    push @raw_lines, $raw_text;

    # Once one line has failed to decode, only the raw bytes are needed.
    next unless $is_utf_compatible;

    # LEAVE_SRC keeps decode() from consuming $raw_text in place, which
    # it otherwise does whenever a CHECK value such as FB_CROAK is given.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC());
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      $is_utf_compatible = 0;
    }
  }

  return $is_utf_compatible ? (1, @unicode_lines) : (0, @raw_lines);
}
51
52# check whether the given unicode strings contain unicode whitespaces
53# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
  # Scans decoded lines for whitespace other than TAB, LF, CR and SPACE.
  #
  # Parameter: reference to an array of (decoded) lines; the lines are
  #            not modified.
  # Returns:   1 as soon as a line contains a disallowed whitespace
  #            character, 0 if no line does.
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for my $line (@{$unicode_lines}) {
    # Skip undefined entries instead of matching against undef.
    next unless defined $line;
    # Double negation: match a character that is whitespace (not \S)
    # and is not one of the four allowed whitespace characters.
    return 1 if $line =~ /[^\S\x{0009}\x{000a}\x{000d}\x{0020}]/;
  }
  return 0;
}
68
69# checks if the text in the file (supplied as the argument) is utf-8 compatible
70# if yes, checks if it contains only allowed whitespaces. If no, then does not
71# do anything. The function seeks to the original position in the file after
72# reading the text.
sub check_allowed_whitespace {
  # Checks the text readable from an open filehandle for disallowed
  # whitespace, then seeks back to where the handle was on entry.
  #
  # Parameter: the filehandle (glob reference or lexical handle).
  # Returns:   0 if the text is valid UTF-8 and contains whitespace other
  #            than TAB, LF, CR and SPACE; 1 otherwise (including when
  #            the text is not valid UTF-8 and the check is skipped).
  # Prints progress and error messages to standard output.

  # Import SEEK_SET here so the constant is defined even when the
  # enclosing script does not load Fcntl itself.
  use Fcntl qw(SEEK_SET);

  my $file = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);

  # If the handle cannot be rewound, the caller's subsequent reads would
  # see no data; say so instead of failing silently.
  seek($file, $pos, SEEK_SET)
    or print "--> WARNING: could not rewind the file after the whitespace check\n";

  if (!$is_utf) {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
    return 1;
  }

  my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
  print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
  if ($has_invalid_whitespaces) {
    print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
    return 0;
  }
  print "--> text contains only allowed whitespaces\n";
  return 1;
}
92
9 93
10if(@ARGV != 1) { 94if(@ARGV != 1) {
11 die "Usage: validate_dict_dir.pl <dict-dir>\n" . 95 die "Usage: validate_dict_dir.pl <dict-dir>\n" .
@@ -29,6 +113,7 @@ $idx = 1;
29$crlf = 1; 113$crlf = 1;
30 114
31print "--> reading $dict/silence_phones.txt\n"; 115print "--> reading $dict/silence_phones.txt\n";
116check_allowed_whitespace(\*S) || set_to_fail();
32while(<S>) { 117while(<S>) {
33 if (! s/\n$//) { 118 if (! s/\n$//) {
34 print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; 119 print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
@@ -73,6 +158,7 @@ $idx = 1;
73$success = 1; 158$success = 1;
74$crlf = 1; 159$crlf = 1;
75print "--> reading $dict/optional_silence.txt\n"; 160print "--> reading $dict/optional_silence.txt\n";
161check_allowed_whitespace(\*OS) or exit 1;
76while(<OS>) { 162while(<OS>) {
77 chomp; 163 chomp;
78 my @col = split(" ", $_); 164 my @col = split(" ", $_);
@@ -101,6 +187,7 @@ $idx = 1;
101$success = 1; 187$success = 1;
102$crlf = 1; 188$crlf = 1;
103print "--> reading $dict/nonsilence_phones.txt\n"; 189print "--> reading $dict/nonsilence_phones.txt\n";
190check_allowed_whitespace(\*NS) or set_to_fail();
104while(<NS>) { 191while(<NS>) {
105 if ($crlf == 1 && m/\r/) { 192 if ($crlf == 1 && m/\r/) {
106 print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; 193 print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
@@ -166,6 +253,7 @@ sub check_lexicon {
166 my %seen_line = {}; 253 my %seen_line = {};
167 $idx = 1; $success = 1; $crlf = 1; 254 $idx = 1; $success = 1; $crlf = 1;
168 print "--> reading $lex\n"; 255 print "--> reading $lex\n";
256 check_allowed_whitespace(\*L) or set_to_fail();
169 while (<L>) { 257 while (<L>) {
170 if ($crlf == 1 && m/\r/) { 258 if ($crlf == 1 && m/\r/) {
171 print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; 259 print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
@@ -333,6 +421,7 @@ if (-s "$dict/extra_questions.txt") {
333 $success = 1; 421 $success = 1;
334 $crlf = 1; 422 $crlf = 1;
335 print "--> reading $dict/extra_questions.txt\n"; 423 print "--> reading $dict/extra_questions.txt\n";
424 check_allowed_whitespace(\*EX) or set_to_fail();
336 while(<EX>) { 425 while(<EX>) {
337 if ($crlf == 1 && m/\r/) { 426 if ($crlf == 1 && m/\r/) {
338 print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; 427 print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl
index 389fe6d34..7e95545b2 100755
--- a/egs/wsj/s5/utils/validate_lang.pl
+++ b/egs/wsj/s5/utils/validate_lang.pl
@@ -3,9 +3,92 @@
3# Apache 2.0. 3# Apache 2.0.
4# Copyright 2012 Guoguo Chen 4# Copyright 2012 Guoguo Chen
5# 2014 Neil Nelson 5# 2014 Neil Nelson
6# 2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
6# 7#
7# Validation script for data/lang 8# Validation script for data/lang
8 9
10# this function reads the opened file (supplied as a first
11# parameter) into an array of lines. For each
12# line, it tests whether it's a valid utf-8 compatible
13# line. If all lines are valid utf-8, it returns the lines
14# decoded as utf-8, otherwise it assumes the file's encoding
15# is one of those 1-byte encodings, such as ISO-8859-x
16# or Windows CP-X.
17# Please recall we do not really care about
18# the actual encoding, we just need to
19# make sure the length of the (decoded) string
20# is correct (to make the output formatting look right).
sub get_utf8_or_bytestream {
  # Reads every remaining line from an already-opened filehandle and
  # reports whether all of them are valid UTF-8.
  #
  # Parameter: the filehandle (glob reference or lexical handle) to read.
  # Returns:   (1, @decoded_lines) when every line decodes as UTF-8,
  #            (0, @raw_lines) otherwise, where the raw lines are the
  #            undecoded bytes exactly as they were read.
  use Encode qw(decode encode);
  my $file = shift;

  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # A lexical loop variable leaves the caller's $_ untouched.  The
  # while() form already tests the read with defined(), so a final line
  # consisting of "0" without a trailing newline is not dropped.
  while (my $raw_text = <$file>) {
    push @raw_lines, $raw_text;

    # Once one line has failed to decode, only the raw bytes are needed.
    next unless $is_utf_compatible;

    # LEAVE_SRC keeps decode() from consuming $raw_text in place, which
    # it otherwise does whenever a CHECK value such as FB_CROAK is given.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC());
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      $is_utf_compatible = 0;
    }
  }

  return $is_utf_compatible ? (1, @unicode_lines) : (0, @raw_lines);
}
51
52# check whether the given unicode strings contain unicode whitespaces
53# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
  # Scans decoded lines for whitespace other than TAB, LF, CR and SPACE.
  #
  # Parameter: reference to an array of (decoded) lines; the lines are
  #            not modified.
  # Returns:   1 as soon as a line contains a disallowed whitespace
  #            character, 0 if no line does.
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for my $line (@{$unicode_lines}) {
    # Skip undefined entries instead of matching against undef.
    next unless defined $line;
    # Double negation: match a character that is whitespace (not \S)
    # and is not one of the four allowed whitespace characters.
    return 1 if $line =~ /[^\S\x{0009}\x{000a}\x{000d}\x{0020}]/;
  }
  return 0;
}
68
69# checks if the text in the file (supplied as the argument) is utf-8 compatible
70# if yes, checks if it contains only allowed whitespaces. If no, then does not
71# do anything. The function seeks to the original position in the file after
72# reading the text.
sub check_allowed_whitespace {
  # Checks the text readable from an open filehandle for disallowed
  # whitespace, then seeks back to where the handle was on entry.
  #
  # Parameter: the filehandle (glob reference or lexical handle).
  # Returns:   0 if the text is valid UTF-8 and contains whitespace other
  #            than TAB, LF, CR and SPACE; 1 otherwise (including when
  #            the text is not valid UTF-8 and the check is skipped).
  # Prints progress and error messages to standard output.

  # Import SEEK_SET here so the constant is defined even when the
  # enclosing script does not load Fcntl itself.
  use Fcntl qw(SEEK_SET);

  my $file = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);

  # If the handle cannot be rewound, the caller's subsequent reads would
  # see no data; say so instead of failing silently.
  seek($file, $pos, SEEK_SET)
    or print "--> WARNING: could not rewind the file after the whitespace check\n";

  if (!$is_utf) {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
    return 1;
  }

  my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
  print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
  if ($has_invalid_whitespaces) {
    print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
    return 0;
  }
  print "--> text contains only allowed whitespaces\n";
  return 1;
}
9 92
10$skip_det_check = 0; 93$skip_det_check = 0;
11$skip_disambig_check = 0; 94$skip_disambig_check = 0;
@@ -44,6 +127,7 @@ if (!open(P, "<$lang/phones.txt")) {
44} 127}
45$idx = 1; 128$idx = 1;
46%psymtab = (); 129%psymtab = ();
130check_allowed_whitespace(\*P) or exit 1;
47while (<P>) { 131while (<P>) {
48 chomp; 132 chomp;
49 my @col = split(" ", $_); 133 my @col = split(" ", $_);
@@ -77,6 +161,7 @@ if (!open(W, "<$lang/words.txt")) {
77} 161}
78$idx = 1; 162$idx = 1;
79%wsymtab = (); 163%wsymtab = ();
164check_allowed_whitespace(\*W) or exit 1;
80while (<W>) { 165while (<W>) {
81 chomp; 166 chomp;
82 my @col = split(" ", $_); 167 my @col = split(" ", $_);
@@ -124,6 +209,7 @@ sub check_txt_int_csl {
124 } 209 }
125 210
126 $idx1 = 1; 211 $idx1 = 1;
212 check_allowed_whitespace(\*TXT) or $exit = 1;
127 while (<TXT>) { 213 while (<TXT>) {
128 chomp; 214 chomp;
129 my @col = split(" ", $_); 215 my @col = split(" ", $_);
@@ -202,6 +288,7 @@ sub check_txt_int {
202 } 288 }
203 289
204 $idx1 = 1; 290 $idx1 = 1;
291 check_allowed_whitespace(\*TXT) or $exit = 1;
205 while (<TXT>) { 292 while (<TXT>) {
206 chomp; 293 chomp;
207 s/^(shared|not-shared) (split|not-split) //g; 294 s/^(shared|not-shared) (split|not-split) //g;
diff --git a/egs/wsj/s5/utils/validate_text.pl b/egs/wsj/s5/utils/validate_text.pl
new file mode 100755
index 000000000..9f8c8df1d
--- /dev/null
+++ b/egs/wsj/s5/utils/validate_text.pl
@@ -0,0 +1,131 @@
1#!/usr/bin/env perl
2#===============================================================================
3# Copyright 2017 (Author: Yenda Trmal <jtrmal@gmail.com>)
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
13# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
14# MERCHANTABLITY OR NON-INFRINGEMENT.
15# See the Apache 2 License for the specific language governing permissions and
16# limitations under the License.
17#===============================================================================
18
19# validation script for data/<dataset>/text
20# to be called (preferably) from utils/validate_data_dir.sh
21use strict;
22use warnings;
23use utf8;
24use Fcntl qw< SEEK_SET >;
25
26# this function reads the opened file (supplied as a first
27# parameter) into an array of lines. For each
28# line, it tests whether it's a valid utf-8 compatible
29# line. If all lines are valid utf-8, it returns the lines
30# decoded as utf-8, otherwise it assumes the file's encoding
31# is one of those 1-byte encodings, such as ISO-8859-x
32# or Windows CP-X.
33# Please recall we do not really care about
34# the actual encoding, we just need to
35# make sure the length of the (decoded) string
36# is correct (to make the output formatting look right).
sub get_utf8_or_bytestream {
  # Reads every remaining line from an already-opened filehandle and
  # reports whether all of them are valid UTF-8.
  #
  # Parameter: the filehandle (glob reference or lexical handle) to read.
  # Returns:   (1, @decoded_lines) when every line decodes as UTF-8,
  #            (0, @raw_lines) otherwise, where the raw lines are the
  #            undecoded bytes exactly as they were read.
  use Encode qw(decode encode);
  my $file = shift;

  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # A lexical loop variable leaves the caller's $_ untouched.  The
  # while() form already tests the read with defined(), so a final line
  # consisting of "0" without a trailing newline is not dropped.
  while (my $raw_text = <$file>) {
    push @raw_lines, $raw_text;

    # Once one line has failed to decode, only the raw bytes are needed.
    next unless $is_utf_compatible;

    # LEAVE_SRC keeps decode() from consuming $raw_text in place, which
    # it otherwise does whenever a CHECK value such as FB_CROAK is given.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC());
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      $is_utf_compatible = 0;
    }
  }

  return $is_utf_compatible ? (1, @unicode_lines) : (0, @raw_lines);
}
67
68# check whether the given unicode strings contain unicode whitespaces
69# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
  # Scans decoded lines for whitespace other than TAB, LF, CR and SPACE.
  #
  # Parameter: reference to an array of (decoded) lines; the lines are
  #            not modified.
  # Returns:   1 as soon as a line contains a disallowed whitespace
  #            character, 0 if no line does.
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for my $line (@{$unicode_lines}) {
    # Skip undefined entries instead of matching against undef.
    next unless defined $line;
    # Double negation: match a character that is whitespace (not \S)
    # and is not one of the four allowed whitespace characters.
    return 1 if $line =~ /[^\S\x{0009}\x{000a}\x{000d}\x{0020}]/;
  }
  return 0;
}
84
85# checks if the text in the file (supplied as the argument) is utf-8 compatible
86# if yes, checks if it contains only allowed whitespaces. If no, then does not
87# do anything. The function seeks to the original position in the file after
88# reading the text.
sub check_allowed_whitespace {
  # Checks the text readable from an open filehandle for disallowed
  # whitespace, then seeks back to where the handle was on entry.
  #
  # Parameter: the filehandle (glob reference or lexical handle).
  # Returns:   0 if the text is valid UTF-8 and contains whitespace other
  #            than TAB, LF, CR and SPACE; 1 otherwise (including when
  #            the text is not valid UTF-8 and the check is skipped).
  # Prints progress and error messages to standard output.

  # Importing SEEK_SET here keeps the sub self-contained; repeating an
  # import that the script already performs is harmless.
  use Fcntl qw(SEEK_SET);

  my $file = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);

  # If the handle cannot be rewound, any later reads from it would see
  # no data; say so instead of failing silently.
  seek($file, $pos, SEEK_SET)
    or print "--> WARNING: could not rewind the file after the whitespace check\n";

  if (!$is_utf) {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
    return 1;
  }

  my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
  print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
  if ($has_invalid_whitespaces) {
    print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
    return 0;
  }
  print "--> text contains only allowed whitespaces\n";
  return 1;
}
# Script entry point: expects exactly one argument, the path of the
# data/<dataset>/text file to validate.
if (@ARGV != 1) {
  die "Usage: validate_text.pl <text-file>\n" .
      "e.g.: validate_text.pl data/train/text\n";
}

my $text = shift @ARGV;

# Checking the text file -------------------------------
print "Checking $text ...\n";
# -z is true only for an existing zero-size file; a missing file is
# reported by the failed open below.
if (-z "$text") {
  print "--> ERROR: $text is empty or not exists\n";
  exit 1;
}

# Three-argument open with a lexical handle: the path comes from the
# command line, so it must never be parsed for mode characters.
my $text_fh;
if (!open($text_fh, "<", $text)) {
  print "--> ERROR: fail to open $text\n";
  exit 1;
}

print "--> reading $text\n";
check_allowed_whitespace($text_fh) or exit 1;
close($text_fh);
130
131