aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeter Smit2017-09-26 23:10:33 -0500
committerDaniel Povey2017-09-26 23:10:33 -0500
commit906006f5a3ee267255242784c678079d00d55e09 (patch)
tree11f29a95dd4beb6e91b77057d1f8d5aa6b5606ec
parentb0cc15790f0bab973cf33a71ac169d8e15d453c8 (diff)
downloadkaldi-906006f5a3ee267255242784c678079d00d55e09.tar.gz
kaldi-906006f5a3ee267255242784c678079d00d55e09.tar.xz
kaldi-906006f5a3ee267255242784c678079d00d55e09.zip
[scripts] Fixes to data-cleanup scripts (#1902)
-rwxr-xr-xegs/wsj/s5/steps/cleanup/clean_and_segment_data.sh3
-rwxr-xr-xegs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py5
-rwxr-xr-xegs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py5
3 files changed, 11 insertions, 2 deletions
diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh
index a523de30e..670e6c2b7 100755
--- a/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh
+++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh
@@ -192,6 +192,9 @@ if [ $stage -le 8 ]; then
192 echo "$0: based on the segments and text file in $dir/segments and $dir/text, creating new data-dir in $data_out" 192 echo "$0: based on the segments and text file in $dir/segments and $dir/text, creating new data-dir in $data_out"
193 padding=$(cat $dir/segment_end_padding) # e.g. 0.02 193 padding=$(cat $dir/segment_end_padding) # e.g. 0.02
194 utils/data/subsegment_data_dir.sh --segment-end-padding $padding ${data} $dir/segments $dir/text $data_out 194 utils/data/subsegment_data_dir.sh --segment-end-padding $padding ${data} $dir/segments $dir/text $data_out
195 # utils/data/subsegment_data_dir.sh can output directories that have e.g. too many entries left in wav.scp
196 # Clean this up with the fix_data_dir.sh script
197 utils/fix_data_dir.sh $data_out
195fi 198fi
196 199
197if [ $stage -le 9 ]; then 200if [ $stage -le 9 ]; then
diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
index b6c7e8a7c..39f6d38d6 100755
--- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
+++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py
@@ -807,10 +807,13 @@ def TimeToString(time, frame_length):
807 807
808def WriteSegmentsForUtterance(text_output_handle, segments_output_handle, 808def WriteSegmentsForUtterance(text_output_handle, segments_output_handle,
809 old_utterance_name, segments): 809 old_utterance_name, segments):
810 num_digits = len(str(len(segments)))
810 for n in range(len(segments)): 811 for n in range(len(segments)):
811 segment = segments[n] 812 segment = segments[n]
812 # split utterances will be named foo-bar-1 foo-bar-2, etc. 813 # split utterances will be named foo-bar-1 foo-bar-2, etc.
813 new_utterance_name = old_utterance_name + "-" + str(n + 1) 814 new_utterance_name = "{old}-{index:0{width}}".format(
815 old=old_utterance_name, index=n+1,
816 width=num_digits)
814 # print a line to the text output of the form like 817 # print a line to the text output of the form like
815 # <new-utterance-id> <text> 818 # <new-utterance-id> <text>
816 # like: 819 # like:
diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py
index 35b9ed605..46a9369ae 100755
--- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py
+++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits_mild.py
@@ -1769,9 +1769,12 @@ def time_to_string(time, frame_length):
1769def write_segments_for_utterance(text_output_handle, segments_output_handle, 1769def write_segments_for_utterance(text_output_handle, segments_output_handle,
1770 old_utterance_name, segments, oov_symbol, 1770 old_utterance_name, segments, oov_symbol,
1771 eps_symbol="<eps>", frame_length=0.01): 1771 eps_symbol="<eps>", frame_length=0.01):
1772 num_digits = len(str(len(segments)))
1772 for n, segment in enumerate(segments): 1773 for n, segment in enumerate(segments):
1773 # split utterances will be named foo-bar-1 foo-bar-2, etc. 1774 # split utterances will be named foo-bar-1 foo-bar-2, etc.
1774 new_utterance_name = old_utterance_name + "-" + str(n + 1) 1775 new_utterance_name = "{old}-{index:0{width}}".format(
1776 old=old_utterance_name, index=n+1,
1777 width=num_digits)
1775 # print a line to the text output of the form like 1778 # print a line to the text output of the form like
1776 # <new-utterance-id> <text> 1779 # <new-utterance-id> <text>
1777 # like: 1780 # like: