summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: aedc2fe)
raw | patch | inline | side by side (parent: aedc2fe)
author | Xingyu Na <asr.naxingyu@gmail.com> | Wed, 19 Jul 2017 17:05:50 +0000 (01:05 +0800)
committer | Daniel Povey <dpovey@gmail.com> | Wed, 19 Jul 2017 17:05:50 +0000 (13:05 -0400)
egs/aishell/s5/local/aishell_data_prep.sh | patch | blob | history
egs/aishell/s5/local/download_and_untar.sh | patch | blob | history
egs/aishell/s5/run.sh | patch | blob | history
diff --git a/egs/aishell/s5/local/aishell_data_prep.sh b/egs/aishell/s5/local/aishell_data_prep.sh
index 052dc9f9b477f8d6e60b55bed8927c3764fb35f1..4747e4f4d82ca937d93b1931bc058e879cbaaad1 100755 (executable)
# find wav audio file for train, dev and test resp.
find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
-n=`wc -l $tmp_dir/wav.flist`
+n=`cat $tmp_dir/wav.flist | wc -l`
[ $n -ne 141925 ] && \
echo Warning: expected 141925 data data files, found $n
# Transcriptions preparation
for dir in $train_dir $dev_dir $test_dir; do
echo Preparing $dir transcriptions
- sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' |\
- sort > $dir/utt.list
- sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' |\
- sort > $dir/utt2spk_all
+ sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+ sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
diff --git a/egs/aishell/s5/local/download_and_untar.sh b/egs/aishell/s5/local/download_and_untar.sh
index 0189bad1d4a5c0c25c6393cd003748906247fa16..3578a1c0835198b4a89c0968a622f301533d68b5 100755 (executable)
echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
echo "With --remove-archive it will remove the archive after successfully un-tarring it."
- echo "<corpus-part> can be one of: data_aishell, resource."
+ echo "<corpus-part> can be one of: data_aishell, resource_aishell."
fi
data=$1
fi
part_ok=false
-list="data_aishell resource"
+list="data_aishell resource_aishell"
for x in $list; do
if [ "$part" == $x ]; then part_ok=true; fi
done
diff --git a/egs/aishell/s5/run.sh b/egs/aishell/s5/run.sh
index 2c8207ec745a397eb661b5ab92ed1ed8d1374138..a99cb51c6565b124e0132054cd1ce96a60a22680 100755 (executable)
--- a/egs/aishell/s5/run.sh
+++ b/egs/aishell/s5/run.sh
data=/export/a05/xna/data
data_url=www.openslr.org/resources/33
-. cmd.sh
+. ./cmd.sh
local/download_and_untar.sh $data $data_url data_aishell || exit 1;
-local/download_and_untar.sh $data $data_url resource || exit 1;
+local/download_and_untar.sh $data $data_url resource_aishell || exit 1;
# Lexicon Preparation,
local/aishell_prepare_dict.sh $data/resource_aishell || exit 1;