diff --git a/egs/wsj/s5/utils/scoring/wer_ops_details.pl b/egs/wsj/s5/utils/scoring/wer_ops_details.pl
index 269b31d45b4aab3fd7d5167f566b9059d617b34f..a34f4a0addc1b06c74e128f84b08614b31a8bf2b 100755 (executable)
use strict;
use warnings;
-use utf8;
-#use List::Util qw[max];
-use Data::Dumper;
use Getopt::Long;
use Pod::Usage;
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
my $help;
my $special_symbol= "<eps>";
my $separator=";";
my $extra_size=4;
my $max_size=16;
+# Reads all lines from the already-opened filehandle passed as the first
+# (and only) argument and decides how the text should be handled.
+#
+# Every line is tested for being valid UTF-8. If all lines are valid,
+# the lines are returned decoded as UTF-8 (character strings). Otherwise
+# the input is assumed to be in some 1-byte encoding (such as ISO-8859-x
+# or Windows CP-x) and the original, undecoded lines are returned.
+#
+# We do not really care about the actual encoding; we only need the
+# length of each (decoded) string to be correct so that the output
+# formatting looks right.
+#
+# Returns a list: a flag (1 = UTF-8 text, 0 = byte stream) followed by
+# the lines, each still carrying its line terminator. A note saying
+# which mode was chosen is printed to STDERR.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # A lexical line variable keeps the caller's $_ untouched, and the
+  # implicit defined() test applied by while() means that a final line
+  # consisting of just "0" (without a newline) is not mistaken for EOF.
+  while (my $raw_text = <$file>) {
+    push @raw_lines, $raw_text;
+
+    # Once a single line has failed to decode, the decoded lines will
+    # never be returned, so there is no point in decoding any further.
+    next unless $is_utf_compatible;
+
+    # LEAVE_SRC is essential here: with a CHECK value set and without
+    # LEAVE_SRC, decode() overwrites its input in place, which would
+    # empty the raw line that the byte-stream fallback has to return.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text,
+             Encode::FB_CROAK() | Encode::LEAVE_SRC());
+    };
+
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;
+    }
+  }
+
+  if ($is_utf_compatible) {
+    print STDERR "$0: Note: handling as utf-8 text\n";
+    return (1, @unicode_lines);
+  }
+
+  print STDERR "$0: Note: handling as byte stream\n";
+  return (0, @raw_lines);
+}
sub print_line {
my $op = $_[0];
my $rewf = $_[1];
my %EDIT_OPS;
my %UTT;
-while (<STDIN>) {
- chomp;
- my @entries = split(" ", $_);
+(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
+if ($is_utf8) {
+ binmode(STDOUT, ":utf8");
+}
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ my @entries = split(" ", $line);
next if @entries < 2;
next if ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
if (scalar @entries <= 2 ) {