summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 13e3a0b)
raw | patch | inline | side by side (parent: 13e3a0b)
author | Jan "yenda" Trmal <jtrmal@gmail.com> | |
Sat, 23 Sep 2017 02:20:37 +0000 (22:20 -0400) | ||
committer | GitHub <noreply@github.com> | |
Sat, 23 Sep 2017 02:20:37 +0000 (22:20 -0400) |
Relates to the scripts providing detailed error analysis after scoring.
egs/wsj/s5/utils/scoring/wer_ops_details.pl | patch | blob | history | |
egs/wsj/s5/utils/scoring/wer_per_spk_details.pl | patch | blob | history | |
egs/wsj/s5/utils/scoring/wer_per_utt_details.pl | patch | blob | history | |
egs/yesno/s5/local/score.sh | [changed from file to symlink] | patch | blob | history |
diff --git a/egs/wsj/s5/utils/scoring/wer_ops_details.pl b/egs/wsj/s5/utils/scoring/wer_ops_details.pl
index 269b31d45b4aab3fd7d5167f566b9059d617b34f..a34f4a0addc1b06c74e128f84b08614b31a8bf2b 100755 (executable)
use strict;
use warnings;
-use utf8;
-#use List::Util qw[max];
-use Data::Dumper;
use Getopt::Long;
use Pod::Usage;
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
my $help;
my $special_symbol= "<eps>";
my $separator=";";
my $extra_size=4;
my $max_size=16;
+# Reads all lines from an already-opened filehandle (the first and only
+# parameter) and decides how the text should be treated.
+#
+# Every line is tested for being valid UTF-8. If all lines are valid,
+# the lines are returned decoded as UTF-8; otherwise the input is assumed
+# to be in one of the 1-byte encodings (such as ISO-8859-x or Windows
+# CP-X) and the lines are returned untouched, as raw bytes.
+# We do not really care about the actual encoding; we just need the
+# length of the (decoded) strings to be correct so that the output
+# formatting looks right.
+#
+# Returns a list whose first element is 1 (lines are decoded UTF-8 text)
+# or 0 (lines are raw bytes), followed by the lines themselves, each
+# still carrying its line terminator. A note saying which mode was
+# chosen is printed to STDERR.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # The explicit defined() test keeps a final line that is just "0"
+  # (no trailing newline), which a plain truth test would discard.
+  while (defined(my $raw_text = <$file>)) {
+    # Keep the raw bytes before attempting to decode them.
+    push @raw_lines, $raw_text;
+
+    # Once one line has failed there is no point in decoding the rest.
+    next unless $is_utf_compatible;
+
+    # Without LEAVE_SRC, decode() with a CHECK value overwrites its
+    # input in place (leaving it empty on success), which would corrupt
+    # the raw lines needed for the byte-stream fallback.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC());
+    };
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;
+    }
+  }
+
+  if (!$is_utf_compatible) {
+    print STDERR "$0: Note: handling as byte stream\n";
+    return (0, @raw_lines);
+  }
+
+  print STDERR "$0: Note: handling as utf-8 text\n";
+  return (1, @unicode_lines);
+}
sub print_line {
my $op = $_[0];
my $rewf = $_[1];
my %EDIT_OPS;
my %UTT;
-while (<STDIN>) {
- chomp;
- my @entries = split(" ", $_);
+(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
+if ($is_utf8) {
+ binmode(STDOUT, ":utf8");
+}
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ my @entries = split(" ", $line);
next if @entries < 2;
next if ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
if (scalar @entries <= 2 ) {
diff --git a/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl b/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl
index e37c0cf673b270a8870ec2393cb5a213d0b48a81..217448e9fb0e64071cffe96d6a3b5864d1fb654b 100755 (executable)
use strict;
use warnings;
-use utf8;
use List::Util qw[max];
use Getopt::Long;
use Pod::Usage;
-use open qw(:std :encoding(UTF-8));
#use Data::Dumper;
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
my $WIDTH=10;
my $SPK_WIDTH=15;
my $help;
my %UTTMAP;
my %PERSPK_STATS;
+# Reads all lines from an already-opened filehandle (the first and only
+# parameter) and decides how the text should be treated.
+#
+# Every line is tested for being valid UTF-8. If all lines are valid,
+# the lines are returned decoded as UTF-8; otherwise the input is assumed
+# to be in one of the 1-byte encodings (such as ISO-8859-x or Windows
+# CP-X) and the lines are returned untouched, as raw bytes.
+# We do not really care about the actual encoding; we just need the
+# length of the (decoded) strings to be correct so that the output
+# formatting looks right.
+#
+# Returns a list whose first element is 1 (lines are decoded UTF-8 text)
+# or 0 (lines are raw bytes), followed by the lines themselves, each
+# still carrying its line terminator. A note saying which mode was
+# chosen is printed to STDERR.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # The explicit defined() test keeps a final line that is just "0"
+  # (no trailing newline), which a plain truth test would discard.
+  while (defined(my $raw_text = <$file>)) {
+    # Keep the raw bytes before attempting to decode them.
+    push @raw_lines, $raw_text;
+
+    # Once one line has failed there is no point in decoding the rest.
+    next unless $is_utf_compatible;
+
+    # Without LEAVE_SRC, decode() with a CHECK value overwrites its
+    # input in place (leaving it empty on success), which would corrupt
+    # the raw lines needed for the byte-stream fallback.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC());
+    };
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;
+    }
+  }
+
+  if (!$is_utf_compatible) {
+    print STDERR "$0: Note: handling as byte stream\n";
+    return (0, @raw_lines);
+  }
+
+  print STDERR "$0: Note: handling as utf-8 text\n";
+  return (1, @unicode_lines);
+}
+
sub print_header {
my $f="%${WIDTH}s";
}
close(UTT2SPK);
-while (<STDIN>) {
- chomp;
- my @entries = split(" ", $_);
+(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
+if ($is_utf8) {
+ binmode(STDOUT, ":utf8");
+}
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ my @entries = split(" ", $line);
next if @entries < 2;
next if $entries[1] ne "#csid" ;
die "Incompatible entry $_ " if @entries != 6;
diff --git a/egs/wsj/s5/utils/scoring/wer_per_utt_details.pl b/egs/wsj/s5/utils/scoring/wer_per_utt_details.pl
index 57afa16e30c0d4df63f82dcdd25eb6e5b438b014..48452d51f49e564ed1f4aa4dc6ecae12949c093d 100755 (executable)
#
use strict;
use warnings;
-use utf8;
use List::Util qw[max];
use Getopt::Long;
use Pod::Usage;
#use Data::Dumper;
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
my $special_symbol= "<eps>";
my $separator=";";
my $output_hyp = 1;
return sprintf("%s%s%s", " " x $left_spaces, $str, " " x $right_spaces);
}
-while (<STDIN>) {
- chomp;
- (my $utt_id, my $alignment) = split (" ", $_, 2);
+# Reads all lines from an already-opened filehandle (the first and only
+# parameter) and decides how the text should be treated.
+#
+# Every line is tested for being valid UTF-8. If all lines are valid,
+# the lines are returned decoded as UTF-8; otherwise the input is assumed
+# to be in one of the 1-byte encodings (such as ISO-8859-x or Windows
+# CP-X) and the lines are returned untouched, as raw bytes.
+# We do not really care about the actual encoding; we just need the
+# length of the (decoded) strings to be correct so that the output
+# formatting looks right.
+#
+# Returns a list whose first element is 1 (lines are decoded UTF-8 text)
+# or 0 (lines are raw bytes), followed by the lines themselves, each
+# still carrying its line terminator. A note saying which mode was
+# chosen is printed to STDERR.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # The explicit defined() test keeps a final line that is just "0"
+  # (no trailing newline), which a plain truth test would discard.
+  while (defined(my $raw_text = <$file>)) {
+    # Keep the raw bytes before attempting to decode them.
+    push @raw_lines, $raw_text;
+
+    # Once one line has failed there is no point in decoding the rest.
+    next unless $is_utf_compatible;
+
+    # Without LEAVE_SRC, decode() with a CHECK value overwrites its
+    # input in place (leaving it empty on success), which would corrupt
+    # the raw lines needed for the byte-stream fallback.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC());
+    };
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;
+    }
+  }
+
+  if (!$is_utf_compatible) {
+    print STDERR "$0: Note: handling as byte stream\n";
+    return (0, @raw_lines);
+  }
+
+  print STDERR "$0: Note: handling as utf-8 text\n";
+  return (1, @unicode_lines);
+}
+
+(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
+if ($is_utf8) {
+ binmode(STDOUT, ":utf8");
+}
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ (my $utt_id, my $alignment) = split (" ", $line, 2);
my @alignment_pairs = split(" ", $alignment); #splits on spaces, does not create empty fields
my @HYP;
deleted file mode 100755 (executable)
index 518e14d67e4888038870019b6268b4727afbff45..0000000000000000000000000000000000000000
index 518e14d67e4888038870019b6268b4727afbff45..0000000000000000000000000000000000000000
+++ /dev/null
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0
-
-[ -f ./path.sh ] && . ./path.sh
-
-# begin configuration section.
-cmd=run.pl
-stage=0
-decode_mbr=true
-word_ins_penalty=0.0
-min_lmwt=7
-max_lmwt=11
-#end configuration section.
-
-[ -f ./path.sh ] && . ./path.sh
-. parse_options.sh || exit 1;
-
-if [ $# -ne 3 ]; then
- echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
- echo " Options:"
- echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
- echo " --stage (0|1|2) # start scoring script from part-way through."
- echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
- echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
- echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
- exit 1;
-fi
-
-data=$1
-lang_or_graph=$2
-dir=$3
-
-symtab=$lang_or_graph/words.txt
-
-for f in $symtab $dir/lat.1.gz $data/text; do
- [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
-done
-
-mkdir -p $dir/scoring/log
-
-cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
-
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
- lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
- lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \
- lattice-best-path --word-symbol-table=$symtab \
- ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
-
-# Note: the double level of quoting for the sed command
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
- cat $dir/scoring/LMWT.tra \| \
- utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
- compute-wer --text --mode=present \
- ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1;
-
-exit 0;
new file mode 120000 (symlink)
index 0000000000000000000000000000000000000000..0afefc3158c9e4a5b2cbec83b61519a89690da00
index 0000000000000000000000000000000000000000..0afefc3158c9e4a5b2cbec83b61519a89690da00
--- /dev/null
+../steps/score_kaldi.sh
\ No newline at end of file