author     david-ryan-snyder  2017-10-03 17:00:45 -0500
committer  Daniel Povey       2017-10-03 17:00:45 -0500
commit     e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b
tree       0bfbfedae99736f8ed1d9cf20500a5d217ecfc39
parent     f348b26c1fccd3ba71837826cfcbb9ee6a9ce6ee
[src,scripts,egs] Xvectors: DNN Embeddings for Speaker Recognition (#1896)
-rwxr-xr-x  egs/lre07/v1/run.sh                                         |   2
-rwxr-xr-x  egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py              | 325
-rwxr-xr-x  egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh          | 102
-rwxr-xr-x  egs/sre08/v1/sid/nnet3/xvector/get_egs.sh                   | 247
-rw-r--r--  egs/sre16/README.txt                                        |  20
-rw-r--r--  egs/sre16/v1/README.txt                                     |  29
-rwxr-xr-x  egs/sre16/v1/cmd.sh                                         |  15
-rw-r--r--  egs/sre16/v1/conf/mfcc.conf                                 |   6
-rw-r--r--  egs/sre16/v1/conf/vad.conf                                  |   2
-rwxr-xr-x  egs/sre16/v1/local/make_musan.py                            |  95
-rwxr-xr-x  egs/sre16/v1/local/make_musan.sh                            |  37
-rwxr-xr-x  egs/sre16/v1/local/make_mx6.sh                              |  41
-rwxr-xr-x  egs/sre16/v1/local/make_mx6_calls.pl                        | 105
-rwxr-xr-x  egs/sre16/v1/local/make_mx6_mic.pl                          |  92
-rwxr-xr-x  egs/sre16/v1/local/make_sre.pl                              |  75
-rwxr-xr-x  egs/sre16/v1/local/make_sre.sh                              |  34
-rwxr-xr-x  egs/sre16/v1/local/make_sre08.pl                            | 131
-rwxr-xr-x  egs/sre16/v1/local/make_sre10.pl                            | 133
-rwxr-xr-x  egs/sre16/v1/local/make_sre16_eval.pl                       | 154
-rwxr-xr-x  egs/sre16/v1/local/make_sre16_unlabeled.pl                  |  90
-rwxr-xr-x  egs/sre16/v1/local/make_swbd2_phase1.pl                     | 106
-rwxr-xr-x  egs/sre16/v1/local/make_swbd2_phase2.pl                     | 107
-rwxr-xr-x  egs/sre16/v1/local/make_swbd2_phase3.pl                     | 102
-rwxr-xr-x  egs/sre16/v1/local/make_swbd_cellular1.pl                   |  83
-rwxr-xr-x  egs/sre16/v1/local/make_swbd_cellular2.pl                   |  83
-rwxr-xr-x  egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh   |  70
l---------  egs/sre16/v1/local/nnet3/xvector/run_xvector.sh             |   1
-rwxr-xr-x  egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh   | 152
-rwxr-xr-x  egs/sre16/v1/path.sh                                        |   5
-rwxr-xr-x  egs/sre16/v1/run.sh                                         | 289
l---------  egs/sre16/v1/sid                                            |   1
l---------  egs/sre16/v1/steps                                          |   1
l---------  egs/sre16/v1/utils                                          |   1
-rw-r--r--  egs/sre16/v2/README.txt                                     |  30
-rwxr-xr-x  egs/sre16/v2/cmd.sh                                         |  15
-rw-r--r--  egs/sre16/v2/conf/mfcc.conf                                 |   6
-rw-r--r--  egs/sre16/v2/conf/vad.conf                                  |   4
l---------  egs/sre16/v2/local                                          |   1
-rwxr-xr-x  egs/sre16/v2/path.sh                                        |   5
-rwxr-xr-x  egs/sre16/v2/run.sh                                         | 320
l---------  egs/sre16/v2/sid                                            |   1
l---------  egs/sre16/v2/steps                                          |   1
l---------  egs/sre16/v2/utils                                          |   1
-rwxr-xr-x  egs/wsj/s5/steps/data/augment_data_dir.py                   | 194
-rwxr-xr-x  egs/wsj/s5/steps/data/reverberate_data_dir.py               |  43
-rw-r--r--  egs/wsj/s5/steps/libs/nnet3/xconfig/stats_layer.py          |  11
-rwxr-xr-x  egs/wsj/s5/utils/combine_data.sh                            |   2
-rwxr-xr-x  egs/wsj/s5/utils/copy_data_dir.sh                           |   4
-rw-r--r--  src/makefiles/default_rules.mk                              |   4
-rw-r--r--  src/nnet3bin/Makefile                                       |   2
-rw-r--r--  src/nnet3bin/nnet3-xvector-compute.cc                       | 211
-rw-r--r--  src/nnet3bin/nnet3-xvector-get-egs.cc                       | 229
52 files changed, 3789 insertions(+), 31 deletions(-)
diff --git a/egs/lre07/v1/run.sh b/egs/lre07/v1/run.sh
index 8664494e5..87f518e63 100755
--- a/egs/lre07/v1/run.sh
+++ b/egs/lre07/v1/run.sh
@@ -13,8 +13,8 @@ set -e
 mfccdir=`pwd`/mfcc
 vaddir=`pwd`/mfcc
 languages=local/general_lr_closed_set_langs.txt
-
 data_root=/export/corpora/LDC
+
 # Training data sources
 local/make_sre_2008_train.pl $data_root/LDC2011S05 data
 local/make_callfriend.pl $data_root/LDC96S60 vietnamese data
diff --git a/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py b/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py
new file mode 100755
index 000000000..72a4572d9
--- /dev/null
+++ b/egs/sre08/v1/sid/nnet3/xvector/allocate_egs.py
@@ -0,0 +1,325 @@
#!/usr/bin/env python3

# Copyright 2017  Johns Hopkins University (Author: Daniel Povey)
#           2017  Johns Hopkins University (Author: Daniel Garcia-Romero)
#           2017  David Snyder
# Apache 2.0

# This script, which is used in getting training examples, decides
# which examples will come from which recordings, and at what point
# during the training.

# You call it as (e.g.)
#
#  allocate_egs.py --min-frames-per-chunk=50 --max-frames-per-chunk=200 \
#   --frames-per-iter=1000000 --num-repeats=60 --num-archives=169 \
#   --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs
#
# The program outputs certain things to the temp directory (e.g.,
# exp/xvector_a/egs/temp) that will enable you to dump the chunks for xvector
# training.  What we'll eventually be doing is invoking the following program
# with something like the following args:
#
#  nnet3-xvector-get-egs [options] exp/xvector_a/temp/ranges.1 \
#    scp:data/train/feats.scp ark:exp/xvector_a/egs/egs_temp.1.ark \
#    ark:exp/xvector_a/egs/egs_temp.2.ark ark:exp/xvector_a/egs/egs_temp.3.ark
#
# where exp/xvector_a/temp/ranges.1 contains something like the following:
#
#   utt1  0  1  0    65  0
#   utt1  6  7  160  50  0
#   utt2  ...
#
# where each line is interpreted as follows:
#  <source-utterance> <relative-archive-index> <absolute-archive-index> \
#    <start-frame-index> <num-frames> <spkr-label>
#
# Note: <relative-archive-index> is the zero-based offset of the archive-index
# within the subset of archives that a particular ranges file corresponds to;
# and <absolute-archive-index> is the 1-based numeric index of the destination
# archive among the entire list of archives, which will form part of the
# archive's filename (e.g. egs/egs.<absolute-archive-index>.ark);
# <absolute-archive-index> is only kept for debug purposes so you can see which
# archive each line corresponds to.
#
# For each line of the ranges file, we specify an eg containing a chunk of data
# from a given utterance, the corresponding speaker label, and the output
# archive.  The list of archives corresponding to ranges.n will be written to
# outputs.n, so in exp/xvector_a/temp/outputs.1 we'd have:
#
#  ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark \
#    ark:exp/xvector_a/egs/egs_temp.3.ark
#
# The number of these files will equal 'num-jobs'.  If you add up the
# word-counts of all the outputs.* files you'll get 'num-archives'.  The number
# of frames in each archive will be about the --frames-per-iter.
#
# This program will also output to the temp directory a file called
# archive_chunk_lengths which tells you the frame-length associated with
# each archive, e.g.,
#  1   60
#  2   120
# the format is:  <archive-index> <num-frames>.  The <num-frames> will always
# be in the range [min-frames-per-chunk, max-frames-per-chunk].

# We're using python 3.x style print but want it to work in python 2.x.
from __future__ import print_function
import re, os, argparse, sys, math, warnings, random

def get_args():
    parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files "
                                     "in preparation for dumping egs for xvector training.",
                                     epilog="Called by sid/nnet3/xvector/get_egs.sh")
    parser.add_argument("--prefix", type=str, default="",
                        help="Adds a prefix to the output files. This is used to distinguish between the train "
                        "and diagnostic files.")
    parser.add_argument("--num-repeats", type=int, default=10,
                        help="Number of times each speaker repeats within an archive.")
    parser.add_argument("--min-frames-per-chunk", type=int, default=50,
                        help="Minimum number of frames-per-chunk used for any archive")
    parser.add_argument("--max-frames-per-chunk", type=int, default=300,
                        help="Maximum number of frames-per-chunk used for any archive")
    parser.add_argument("--randomize-chunk-length", type=str,
                        help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]. "
                        "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk "
                        "according to a geometric sequence.",
                        default="true", choices=["false", "true"])
    parser.add_argument("--frames-per-iter", type=int, default=1000000,
                        help="Target number of frames for each archive")
    parser.add_argument("--num-archives", type=int, default=-1,
                        help="Number of archives to write")
    parser.add_argument("--num-jobs", type=int, default=-1,
                        help="Number of jobs we're going to use to write the archives; the ranges.* "
                        "and outputs.* files are indexed by job.  Must be <= the --num-archives option.")
    parser.add_argument("--seed", type=int, default=123,
                        help="Seed for random number generator")
    parser.add_argument("--num-pdfs", type=int, default=-1,
                        help="Number of distinct speaker labels (pdfs)")

    # now the required filename arguments
    parser.add_argument("--utt2len-filename", type=str, required=True,
                        help="utt2len file of the features to be used as input (format is: "
                        "<utterance-id> <num-frames>)")
    parser.add_argument("--utt2int-filename", type=str, required=True,
                        help="utt2int file of the features to be used as input (format is: "
                        "<utterance-id> <id>)")
    parser.add_argument("--egs-dir", type=str, required=True,
                        help="Name of egs directory, e.g. exp/xvector_a/egs")

    print(' '.join(sys.argv), file=sys.stderr)
    print(sys.argv, file=sys.stderr)
    args = parser.parse_args()
    args = process_args(args)
    return args

def process_args(args):
    if args.num_repeats < 1:
        raise Exception("--num-repeats should have a minimum value of 1")
    if not os.path.exists(args.utt2int_filename):
        raise Exception("This script expects --utt2int-filename to exist")
    if not os.path.exists(args.utt2len_filename):
        raise Exception("This script expects --utt2len-filename to exist")
    if args.min_frames_per_chunk <= 1:
        raise Exception("--min-frames-per-chunk is invalid.")
    if args.max_frames_per_chunk < args.min_frames_per_chunk:
        raise Exception("--max-frames-per-chunk is invalid.")
    if args.frames_per_iter < 1000:
        raise Exception("--frames-per-iter is invalid.")
    if args.num_archives < 1:
        raise Exception("--num-archives is invalid")
    if args.num_jobs > args.num_archives:
        raise Exception("--num-jobs is invalid (must not exceed num-archives)")
    return args

# Create utt2len, a map from utterance-id to length in frames.
def get_utt2len(utt2len_filename):
    utt2len = {}
    f = open(utt2len_filename, "r")
    if f is None:
        sys.exit("Error opening utt2len file " + str(utt2len_filename))
    for line in f:
        tokens = line.split()
        if len(tokens) != 2:
            sys.exit("bad line in utt2len file " + line)
        utt2len[tokens[0]] = int(tokens[1])
    f.close()
    return utt2len
    # Done utt2len

# Handle utt2int; create spk2utt (speaker-label to list of utterances),
# utt2spk (utterance to speaker-label), and the list of speaker labels.
def get_labels(utt2int_filename):
    f = open(utt2int_filename, "r")
    if f is None:
        sys.exit("Error opening utt2int file " + str(utt2int_filename))
    spk2utt = {}
    utt2spk = {}
    for line in f:
        tokens = line.split()
        if len(tokens) != 2:
            sys.exit("bad line in utt2int file " + line)
        spk = int(tokens[1])
        utt = tokens[0]
        utt2spk[utt] = spk
        if spk not in spk2utt:
            spk2utt[spk] = [utt]
        else:
            spk2utt[spk].append(utt)
    spks = spk2utt.keys()
    f.close()
    return spks, spk2utt, utt2spk
    # Done utt2int

# This function returns a random utterance from the given speaker, chosen
# uniformly at random.  (The min_length argument is currently unused.)
def get_random_utt(spkr, spk2utt, min_length):
    this_utts = spk2utt[spkr]
    this_num_utts = len(this_utts)
    i = random.randint(0, this_num_utts - 1)
    utt = this_utts[i]
    return utt

def random_chunk_length(min_frames_per_chunk, max_frames_per_chunk):
    ans = random.randint(min_frames_per_chunk, max_frames_per_chunk)
    return ans

# This function returns an integer in the range
# [min-frames-per-chunk, max-frames-per-chunk] according to a geometric
# sequence.  For example, suppose min-frames-per-chunk is 50,
# max-frames-per-chunk is 200, and args.num_archives is 3.  Then the
# lengths for archives 0, 1, and 2 will be 50, 100, and 200.
def deterministic_chunk_length(archive_id, num_archives, min_frames_per_chunk, max_frames_per_chunk):
    if max_frames_per_chunk == min_frames_per_chunk:
        return max_frames_per_chunk
    elif num_archives == 1:
        return int(max_frames_per_chunk)
    else:
        return int(math.pow(float(max_frames_per_chunk) /
                            min_frames_per_chunk, float(archive_id) /
                            (num_archives - 1)) * min_frames_per_chunk + 0.5)
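
# Illustrative aside (not part of the original script): plugging the example
# values from the comment above into the formula confirms the geometric
# progression.  With min=50, max=200 and num_archives=3:
#   int(math.pow(200.0/50, 0/2.0) * 50 + 0.5) == 50
#   int(math.pow(200.0/50, 1/2.0) * 50 + 0.5) == 100
#   int(math.pow(200.0/50, 2/2.0) * 50 + 0.5) == 200
# i.e. each archive's chunk length is the previous one multiplied by
# (max/min)^(1/(num_archives-1)), which is 2 in this example.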

# Given an utterance length utt_length (in frames) and a desired chunk length
# (in frames) that is <= utt_length, this function randomly picks the
# starting point of the chunk.
def get_random_offset(utt_length, length):
    if length > utt_length:
        sys.exit("code error: length > utt-length")
    free_length = utt_length - length

    offset = random.randint(0, free_length)
    return offset

def main():
    args = get_args()
    if not os.path.exists(args.egs_dir + "/temp"):
        os.makedirs(args.egs_dir + "/temp")
    random.seed(args.seed)
    utt2len = get_utt2len(args.utt2len_filename)
    spks, spk2utt, utt2spk = get_labels(args.utt2int_filename)
    if args.num_pdfs == -1:
        args.num_pdfs = max(spks) + 1

    # archive_chunk_lengths is a mapping from archive id to the number of
    # frames in examples of that archive.
    archive_chunk_lengths = []
    # all_egs contains, for each archive, a list of 2-tuples (utt-id, offset).
    all_egs = []

    prefix = ""
    if args.prefix != "":
        prefix = args.prefix + "_"

    info_f = open(args.egs_dir + "/temp/" + prefix + "archive_chunk_lengths", "w")
    if info_f is None:
        sys.exit("Error opening file {0}/temp/{1}archive_chunk_lengths".format(args.egs_dir, prefix))
    for archive_index in range(args.num_archives):
        print("Processing archive {0}".format(archive_index + 1))
        if args.randomize_chunk_length == "true":
            # don't constrain the lengths to be the same
            length = random_chunk_length(args.min_frames_per_chunk, args.max_frames_per_chunk)
        else:
            length = deterministic_chunk_length(archive_index, args.num_archives, args.min_frames_per_chunk, args.max_frames_per_chunk)
        print("{0} {1}".format(archive_index + 1, length), file=info_f)
        archive_chunk_lengths.append(length)
        this_num_egs = int((args.frames_per_iter / length) + 1)
        this_egs = []  # a list of 2-tuples of the form (utt-id, start-frame)
        spkrs = args.num_repeats * list(spk2utt.keys())
        random.shuffle(spkrs)
        for n in range(this_num_egs):
            if len(spkrs) == 0:
                print("Ran out of speakers for archive {0}".format(archive_index + 1))
                break
            spkr = spkrs.pop()
            utt = get_random_utt(spkr, spk2utt, length)
            utt_len = utt2len[utt]
            offset = get_random_offset(utt_len, length)
            this_egs.append((utt, offset))
        all_egs.append(this_egs)
    info_f.close()

    # work out how many archives we assign to each job in an equitable way.
    num_archives_per_job = [0] * args.num_jobs
    for i in range(0, args.num_archives):
        num_archives_per_job[i % args.num_jobs] = num_archives_per_job[i % args.num_jobs] + 1

    pdf2num = {}
    cur_archive = 0
    for job in range(args.num_jobs):
        this_ranges = []
        this_archives_for_job = []
        this_num_archives = num_archives_per_job[job]

        for i in range(0, this_num_archives):
            this_archives_for_job.append(cur_archive)
            for (utterance_index, offset) in all_egs[cur_archive]:
                this_ranges.append((utterance_index, i, offset))
            cur_archive = cur_archive + 1

        f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w")
        if f is None:
            sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1))
        for (utterance_index, i, offset) in sorted(this_ranges):
            archive_index = this_archives_for_job[i]
            print("{0} {1} {2} {3} {4} {5}".format(utterance_index,
                                                   i,
                                                   archive_index + 1,
                                                   offset,
                                                   archive_chunk_lengths[archive_index],
                                                   utt2spk[utterance_index]),
                  file=f)
            if utt2spk[utterance_index] in pdf2num:
                pdf2num[utt2spk[utterance_index]] += 1
            else:
                pdf2num[utt2spk[utterance_index]] = 1
        f.close()

        f = open(args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1), "w")
        if f is None:
            sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1))
        print(" ".join(["{0}/{1}egs_temp.{2}.ark".format(args.egs_dir, prefix, n + 1)
                        for n in this_archives_for_job]),
              file=f)
        f.close()

    f = open(args.egs_dir + "/" + prefix + "pdf2num", "w")
    nums = []
    for k in range(0, args.num_pdfs):
        if k in pdf2num:
            nums.append(pdf2num[k])
        else:
            nums.append(0)

    print(" ".join(map(str, nums)), file=f)
    f.close()

    print("allocate_egs.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files")

if __name__ == "__main__":
    main()
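As an aside, the six-field ranges.* format documented at the top of
allocate_egs.py is what nnet3-xvector-get-egs consumes.  A minimal Python
sketch of parsing one such line (illustrative only; parse_ranges_line and
RangeLine are hypothetical helpers, not part of this commit):

#!/usr/bin/env python3
# Hypothetical helper: parse one line of a ranges.* file written by
# allocate_egs.py.  Field layout follows the comment block in that script.
from collections import namedtuple

RangeLine = namedtuple(
    "RangeLine",
    ["utt", "relative_archive_index", "absolute_archive_index",
     "start_frame", "num_frames", "spkr_label"])

def parse_ranges_line(line):
    utt, rel, absolute, start, num, label = line.split()
    # every field except the utterance id is an integer
    return RangeLine(utt, int(rel), int(absolute), int(start), int(num), int(label))

# e.g. parse_ranges_line("utt1 0 1 0 65 0")
# -> RangeLine(utt='utt1', relative_archive_index=0, absolute_archive_index=1,
#              start_frame=0, num_frames=65, spkr_label=0)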
diff --git a/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh b/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh
new file mode 100755
index 000000000..5b8a32b54
--- /dev/null
+++ b/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh
@@ -0,0 +1,102 @@
#!/bin/bash

# Copyright     2017  David Snyder
#               2017  Johns Hopkins University (Author: Daniel Povey)
#               2017  Johns Hopkins University (Author: Daniel Garcia Romero)
# Apache 2.0.

# This script extracts embeddings (called "xvectors" here) from a set of
# utterances, given features and a trained DNN.  The purpose of this script
# is analogous to sid/extract_ivectors.sh: it creates archives of
# vectors that are used in speaker recognition.  Like ivectors, xvectors can
# be used in PLDA or a similar backend for scoring.

# Begin configuration section.
nj=30
cmd="run.pl"
chunk_size=-1 # The chunk size over which the embedding is extracted.
              # If left unspecified, it uses the max_chunk_size in the nnet
              # directory.
use_gpu=false
stage=0

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <nnet-dir> <data> <xvector-dir>"
  echo " e.g.: $0 exp/xvector_nnet data/train exp/xvectors_train"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --use-gpu <bool|false>                           # If true, use GPU."
  echo "  --nj <n|30>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --chunk-size <n|-1>                              # If provided, extracts embeddings with specified"
  echo "                                                   # chunk size, and averages to produce final embedding"
  exit 1;
fi

srcdir=$1
data=$2
dir=$3

for f in $srcdir/final.raw $srcdir/min_chunk_size $srcdir/max_chunk_size $data/feats.scp $data/vad.scp ; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

min_chunk_size=`cat $srcdir/min_chunk_size 2>/dev/null`
max_chunk_size=`cat $srcdir/max_chunk_size 2>/dev/null`

nnet=$srcdir/final.raw
if [ -f $srcdir/extract.config ] ; then
  echo "$0: using $srcdir/extract.config to extract xvectors"
  nnet="nnet3-copy --nnet-config=$srcdir/extract.config $srcdir/final.raw - |"
fi

if [ $chunk_size -le 0 ]; then
  chunk_size=$max_chunk_size
fi

if [ $max_chunk_size -lt $chunk_size ]; then
  echo "$0: specified chunk size of $chunk_size is larger than the maximum chunk size, $max_chunk_size" && exit 1;
fi

mkdir -p $dir/log

utils/split_data.sh $data $nj
echo "$0: extracting xvectors for $data"
sdata=$data/split$nj/JOB

# Set up the features
feat="ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sdata}/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:${sdata}/vad.scp ark:- |"

if [ $stage -le 0 ]; then
  echo "$0: extracting xvectors from nnet"
  if $use_gpu; then
    for g in $(seq $nj); do
      $cmd --gpu 1 ${dir}/log/extract.$g.log \
        nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \
        "$nnet" "`echo $feat | sed s/JOB/$g/g`" ark,scp:${dir}/xvector.$g.ark,${dir}/xvector.$g.scp || exit 1 &
    done
    wait
  else
    $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
      nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \
      "$nnet" "$feat" ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp || exit 1;
  fi
fi

if [ $stage -le 1 ]; then
  echo "$0: combining xvectors across jobs"
  for j in $(seq $nj); do cat $dir/xvector.$j.scp; done >$dir/xvector.scp || exit 1;
fi

if [ $stage -le 2 ]; then
  # Average the utterance-level xvectors to get speaker-level xvectors.
  echo "$0: computing mean of xvectors for each speaker"
  $cmd $dir/log/speaker_mean.log \
    ivector-mean ark:$data/spk2utt scp:$dir/xvector.scp \
    ark,scp:$dir/spk_xvector.ark,$dir/spk_xvector.scp ark,t:$dir/num_utts.ark || exit 1;
fi
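The last stage above uses ivector-mean to average the utterance-level
xvectors of each speaker.  As a rough illustration of that averaging (a
hypothetical sketch on plain Python lists, not the binary's actual
implementation):

#!/usr/bin/env python3
# Illustrative sketch: per-speaker averaging of utterance-level vectors,
# mirroring what ivector-mean does with spk2utt and xvector.scp.
def speaker_means(spk2utt, utt2xvector):
    """spk2utt: dict speaker -> list of utterance ids;
    utt2xvector: dict utterance id -> list of floats (the xvector)."""
    spk_xvector = {}
    for spk, utts in spk2utt.items():
        vecs = [utt2xvector[u] for u in utts if u in utt2xvector]
        if not vecs:
            continue
        dim = len(vecs[0])
        spk_xvector[spk] = [sum(v[i] for v in vecs) / len(vecs)
                            for i in range(dim)]
    return spk_xvector

# e.g. speaker_means({"spk1": ["utt1", "utt2"]},
#                    {"utt1": [1.0, 2.0], "utt2": [3.0, 4.0]})
# -> {"spk1": [2.0, 3.0]}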
diff --git a/egs/sre08/v1/sid/nnet3/xvector/get_egs.sh b/egs/sre08/v1/sid/nnet3/xvector/get_egs.sh
new file mode 100755
index 000000000..3f2200c11
--- /dev/null
+++ b/egs/sre08/v1/sid/nnet3/xvector/get_egs.sh
@@ -0,0 +1,247 @@
#!/bin/bash

# Copyright 2017  Johns Hopkins University (Author: Daniel Povey)
#           2017  Johns Hopkins University (Author: Daniel Garcia-Romero)
#           2017  David Snyder
# Apache 2.0
#
# This script dumps training examples (egs) for multiclass xvector training.
# These egs consist of a data chunk and a zero-based speaker label.
# Each archive of egs has, in general, a different input chunk-size.
# We don't mix together different lengths in the same archive, because it
# would require us to repeatedly run the compilation process within the same
# training job.
#
# This script, which will generally be called from other neural net training
# scripts, extracts the training examples used to train the neural net (and
# also the validation examples used for diagnostics), and puts them in
# separate archives.


# Begin configuration section.
cmd=run.pl
# Each archive has data-chunks of length randomly chosen between
# $min_frames_per_chunk and $max_frames_per_chunk.
min_frames_per_chunk=50
max_frames_per_chunk=300
frames_per_iter=10000000 # target number of frames per archive.

frames_per_iter_diagnostic=100000 # have this many frames per archive for
                                  # the archives used for diagnostics.

num_diagnostic_archives=3 # we want to test the training likelihoods
                          # on a range of utterance lengths, and this number
                          # controls how many archives we evaluate on.


compress=true # set this to false to disable compression (e.g. if you want to
              # see whether results are affected).

num_heldout_utts=100 # number of utterances held out for the validation set
                     # (also the size of the training diagnostic subset)

num_repeats=1 # number of times each speaker repeats per archive

stage=0
nj=6  # This should be set to the maximum number of jobs you are comfortable
      # running in parallel; you can increase it if your disks are fast
      # and you have more machines.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 [opts] <data> <egs-dir>"
  echo " e.g.: $0 data/train exp/xvector_a/egs"
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config file containing options"
  echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
  echo "                                                   # parallel (increase this only if you have good disk and"
  echo "                                                   # network speed). default=6"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --min-frames-per-chunk <#frames|50>              # The minimum number of frames per chunk that we dump"
  echo "  --max-frames-per-chunk <#frames|300>             # The maximum number of frames per chunk that we dump"
  echo "  --num-repeats <#repeats|1>                       # The (approximate) number of times the training"
  echo "                                                   # data is repeated in the egs"
  echo "  --frames-per-iter <#samples|10000000>            # Target number of frames per archive"
  echo "  --num-diagnostic-archives <#archives|3>          # Option that controls how many different versions of"
  echo "                                                   # the train and validation archives we create (e.g."
  echo "                                                   # train_subset.{1,2,3}.egs and valid.{1,2,3}.egs by default;"
  echo "                                                   # they contain different utterance lengths)."
  echo "  --frames-per-iter-diagnostic <#samples|100000>   # Target number of frames for the diagnostic archives"
  echo "                                                   # {train_subset,valid}.*.egs"
  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
  echo "                                                   # the middle."

  exit 1;
fi

data=$1
dir=$2

for f in $data/utt2num_frames $data/feats.scp ; do
  [ ! -f $f ] && echo "$0: expected file $f" && exit 1;
done

feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1

mkdir -p $dir/info $dir/temp
temp=$dir/temp

echo $feat_dim > $dir/info/feat_dim
echo '0' > $dir/info/left_context
# The examples have at least min_frames_per_chunk right context.
echo $min_frames_per_chunk > $dir/info/right_context
echo '1' > $dir/info/frames_per_eg
cp $data/utt2num_frames $dir/temp/utt2num_frames

if [ $stage -le 0 ]; then
  echo "$0: Preparing train and validation lists"
  # Pick a list of heldout utterances for validation egs
  awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/valid_uttlist || exit 1;
  # The remaining utterances are used for training egs
  utils/filter_scp.pl --exclude $temp/valid_uttlist $temp/utt2num_frames > $temp/utt2num_frames.train
  utils/filter_scp.pl $temp/valid_uttlist $temp/utt2num_frames > $temp/utt2num_frames.valid
  # Pick a subset of the training list for diagnostics
  awk '{print $1}' $temp/utt2num_frames.train | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/train_subset_uttlist || exit 1;
  utils/filter_scp.pl $temp/train_subset_uttlist <$temp/utt2num_frames.train > $temp/utt2num_frames.train_subset
  # Create a mapping from utterance to speaker ID (an integer)
  awk -v id=0 '{print $1, id++}' $data/spk2utt > $temp/spk2int
  utils/sym2int.pl -f 2 $temp/spk2int $data/utt2spk > $temp/utt2int
  utils/filter_scp.pl $temp/utt2num_frames.train $temp/utt2int > $temp/utt2int.train
  utils/filter_scp.pl $temp/utt2num_frames.valid $temp/utt2int > $temp/utt2int.valid
  utils/filter_scp.pl $temp/utt2num_frames.train_subset $temp/utt2int > $temp/utt2int.train_subset
fi

num_pdfs=$(awk '{print $2}' $temp/utt2int | sort | uniq -c | wc -l)
# The script assumes you've prepared the features ahead of time.
feats="scp,s,cs:utils/filter_scp.pl $temp/ranges.JOB $data/feats.scp |"
train_subset_feats="scp,s,cs:utils/filter_scp.pl $temp/train_subset_ranges.1 $data/feats.scp |"
valid_feats="scp,s,cs:utils/filter_scp.pl $temp/valid_ranges.1 $data/feats.scp |"

# First, for the training data, work out how many archives we need.
num_train_frames=$(awk '{n += $2} END{print n}' <$temp/utt2num_frames.train)
num_train_subset_frames=$(awk '{n += $2} END{print n}' <$temp/utt2num_frames.train_subset)

echo $num_train_frames >$dir/info/num_frames
num_train_archives=$(( (num_train_frames * num_repeats) / frames_per_iter + 1 ))
echo "$0: Producing $num_train_archives archives for training"
echo $num_train_archives > $dir/info/num_archives
echo $num_diagnostic_archives > $dir/info/num_diagnostic_archives

if [ $nj -gt $num_train_archives ]; then
  echo "$0: Reducing num-jobs $nj to number of training archives $num_train_archives"
  nj=$num_train_archives
fi

if [ $stage -le 1 ]; then
  if [ -e $dir/storage ]; then
    # Make soft links to storage directories, if distributing this way.  See
    # utils/create_split_dir.pl.
    echo "$0: creating data links"
    utils/create_data_link.pl $(for x in $(seq $num_train_archives); do echo $dir/egs.$x.ark; done)
    utils/create_data_link.pl $(for x in $(seq $num_train_archives); do echo $dir/egs_temp.$x.ark; done)
  fi
fi

if [ $stage -le 2 ]; then
  echo "$0: Allocating training examples"
  $cmd $dir/log/allocate_examples_train.log \
    sid/nnet3/xvector/allocate_egs.py \
      --num-repeats=$num_repeats \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --frames-per-iter=$frames_per_iter \
      --num-archives=$num_train_archives --num-jobs=$nj \
      --utt2len-filename=$dir/temp/utt2num_frames.train \
      --utt2int-filename=$dir/temp/utt2int.train --egs-dir=$dir || exit 1

  echo "$0: Allocating training subset examples"
  $cmd $dir/log/allocate_examples_train_subset.log \
    sid/nnet3/xvector/allocate_egs.py \
      --prefix train_subset \
      --num-repeats=1 \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --randomize-chunk-length false \
      --frames-per-iter=$frames_per_iter_diagnostic \
      --num-archives=$num_diagnostic_archives --num-jobs=1 \
      --utt2len-filename=$dir/temp/utt2num_frames.train_subset \
      --utt2int-filename=$dir/temp/utt2int.train_subset --egs-dir=$dir || exit 1

  echo "$0: Allocating validation examples"
  $cmd $dir/log/allocate_examples_valid.log \
    sid/nnet3/xvector/allocate_egs.py \
      --prefix valid \
      --num-repeats=1 \
      --min-frames-per-chunk=$min_frames_per_chunk \
      --max-frames-per-chunk=$max_frames_per_chunk \
      --randomize-chunk-length false \
      --frames-per-iter=$frames_per_iter_diagnostic \
      --num-archives=$num_diagnostic_archives --num-jobs=1 \
      --utt2len-filename=$dir/temp/utt2num_frames.valid \
      --utt2int-filename=$dir/temp/utt2int.valid --egs-dir=$dir || exit 1
fi

# At this stage we'll have created the ranges files that define how many egs
# there are and where they come from.  If this is your first time running this
# script, you might decide to put an exit 1 command here, and inspect the
# contents of $dir/temp/ranges.* before proceeding to the next stage.
if [ $stage -le 3 ]; then
  echo "$0: Generating training examples on disk"
  rm $dir/.error 2>/dev/null
  for g in $(seq $nj); do
    outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/outputs.$g)
    $cmd $dir/log/train_create_examples.$g.log \
      nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/ranges.$g \
      "`echo $feats | sed s/JOB/$g/g`" $outputs || touch $dir/.error &
  done
  train_subset_outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/train_subset_outputs.1)
  echo "$0: Generating training subset examples on disk"
  $cmd $dir/log/train_subset_create_examples.1.log \
    nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/train_subset_ranges.1 \
    "$train_subset_feats" $train_subset_outputs || touch $dir/.error &
  wait
  valid_outputs=$(awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/valid_outputs.1)
  echo "$0: Generating validation examples on disk"
  $cmd $dir/log/valid_create_examples.1.log \
    nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/valid_ranges.1 \
    "$valid_feats" $valid_outputs || touch $dir/.error &
  wait
  if [ -f $dir/.error ]; then
    echo "$0: Problem detected while dumping examples"
    exit 1
  fi
fi

if [ $stage -le 4 ]; then
  echo "$0: Shuffling order of archives on disk"
  $cmd --max-jobs-run $nj JOB=1:$num_train_archives $dir/log/shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/egs_temp.JOB.ark \
    ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp || exit 1;
  $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/train_subset_shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/train_subset_egs_temp.JOB.ark \
    ark,scp:$dir/train_diagnostic_egs.JOB.ark,$dir/train_diagnostic_egs.JOB.scp || exit 1;
  $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/valid_shuffle.JOB.log \
    nnet3-shuffle-egs --srand=JOB ark:$dir/valid_egs_temp.JOB.ark \
    ark,scp:$dir/valid_egs.JOB.ark,$dir/valid_egs.JOB.scp || exit 1;
fi

if [ $stage -le 5 ]; then
  # Remove the temporary (pre-shuffle) archives, following links if necessary,
  # and build the combined diagnostic scp files.
  for file in $(for x in $(seq $num_diagnostic_archives); do echo $dir/train_subset_egs_temp.$x.ark; done) \
    $(for x in $(seq $num_diagnostic_archives); do echo $dir/valid_egs_temp.$x.ark; done) \
    $(for x in $(seq $num_train_archives); do echo $dir/egs_temp.$x.ark; done); do
    [ -L $file ] && rm $(readlink -f $file)
    rm $file
  done
  rm -rf $dir/valid_diagnostic.scp $dir/train_diagnostic.scp
  for x in $(seq $num_diagnostic_archives); do
    cat $dir/train_diagnostic_egs.$x.scp >> $dir/train_diagnostic.scp
    cat $dir/valid_egs.$x.scp >> $dir/valid_diagnostic.scp
  done
  ln -sf train_diagnostic.scp $dir/combine.scp
fi

echo "$0: Finished preparing training examples"
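To make the archive-count arithmetic above concrete, here is the line
num_train_archives=$(( (num_train_frames * num_repeats) / frames_per_iter + 1 ))
worked through in Python with assumed example values (only frames_per_iter and
num_repeats below are this script's actual defaults):

#!/usr/bin/env python3
# Illustrative check of get_egs.sh's archive-count arithmetic.
num_train_frames = 50000000   # assumed example: ~50M frames of training data
num_repeats = 1               # this script's default
frames_per_iter = 10000000    # this script's default
num_train_archives = (num_train_frames * num_repeats) // frames_per_iter + 1
print(num_train_archives)     # -> 6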
diff --git a/egs/sre16/README.txt b/egs/sre16/README.txt
new file mode 100644
index 000000000..24eb4a5d1
--- /dev/null
+++ b/egs/sre16/README.txt
@@ -0,0 +1,20 @@
This directory (sre16) contains example scripts for the NIST SRE 2016
speaker recognition evaluation.  The following corpora are required to
perform the evaluation:

    NIST SRE 2016 enroll set
    NIST SRE 2016 test set

More details on NIST SRE 2016 can be found at the URL
https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016.

Additional data sources (mostly past NIST SREs, Switchboard, etc.) are
required to train the systems in the subdirectories.  See the
corresponding README.txt files in the subdirectories for more details.

The subdirectories "v1" and so on are different speaker recognition
recipes.  The recipe in v1 demonstrates a standard approach using a
full-covariance GMM-UBM, iVectors, and a PLDA backend.  The example
in v2 demonstrates DNN speaker embeddings with a PLDA backend.
diff --git a/egs/sre16/v1/README.txt b/egs/sre16/v1/README.txt
new file mode 100644
index 000000000..41c00420f
--- /dev/null
+++ b/egs/sre16/v1/README.txt
@@ -0,0 +1,29 @@
This example demonstrates a traditional iVector system evaluated on NIST
SRE 2016.  It is based on the recipe in ../../sre10/v1/.  In addition to the
standard features of the SRE10 recipe, it also demonstrates the use of data
augmentation for PLDA training.

The recipe uses the following data for system development.  This is in
addition to the NIST SRE 2016 dataset used for evaluation (see ../README.txt).

    Corpus              LDC Catalog No.
    SWBD2 Phase 1       LDC98S75
    SWBD2 Phase 2       LDC99S79
    SWBD2 Phase 3       LDC2002S06
    SWBD Cellular 1     LDC2001S13
    SWBD Cellular 2     LDC2004S07
    SRE2004             LDC2006S44
    SRE2005 Train       LDC2011S01
    SRE2005 Test        LDC2011S04
    SRE2006 Train       LDC2011S09
    SRE2006 Test 1      LDC2011S10
    SRE2006 Test 2      LDC2012S01
    SRE2008 Train       LDC2011S05
    SRE2008 Test        LDC2011S08
    SRE2010 Eval        LDC2017S06
    Mixer 6             LDC2013S03

The following datasets are used in data augmentation.

    MUSAN               http://www.openslr.org/17
    RIR_NOISES          http://www.openslr.org/28
diff --git a/egs/sre16/v1/cmd.sh b/egs/sre16/v1/cmd.sh
new file mode 100755
index 000000000..d1ca1a6d1
--- /dev/null
+++ b/egs/sre16/v1/cmd.sh
@@ -0,0 +1,15 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful, and run
# commands one by one: most recipes will exhaust the memory on your
# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
# with slurm.  Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration.  Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 4G"
diff --git a/egs/sre16/v1/conf/mfcc.conf b/egs/sre16/v1/conf/mfcc.conf
new file mode 100644
index 000000000..e09ee9385
--- /dev/null
+++ b/egs/sre16/v1/conf/mfcc.conf
@@ -0,0 +1,6 @@
--sample-frequency=8000
--frame-length=25 # the default is 25
--low-freq=20 # the default.
--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
--num-ceps=20 # higher than the default, which is 13.
--snip-edges=false
diff --git a/egs/sre16/v1/conf/vad.conf b/egs/sre16/v1/conf/vad.conf
new file mode 100644
index 000000000..a0ca2449b
--- /dev/null
+++ b/egs/sre16/v1/conf/vad.conf
@@ -0,0 +1,2 @@
--vad-energy-threshold=5.5
--vad-energy-mean-scale=0.5
diff --git a/egs/sre16/v1/local/make_musan.py b/egs/sre16/v1/local/make_musan.py
new file mode 100755
index 000000000..b0bb362e0
--- /dev/null
+++ b/egs/sre16/v1/local/make_musan.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
# Copyright 2015   David Snyder
# Apache 2.0.
#
# This file is meant to be invoked by make_musan.sh.

import os, sys

def process_music_annotations(path):
    utt2spk = {}
    utt2vocals = {}
    lines = open(path, 'r').readlines()
    for line in lines:
        utt, genres, vocals, musician = line.rstrip().split()[:4]
        # For this application, the musician ID isn't important
        utt2spk[utt] = utt
        utt2vocals[utt] = vocals == "Y"
    return utt2spk, utt2vocals

def prepare_music(root_dir, use_vocals):
    utt2vocals = {}
    utt2spk = {}
    utt2wav = {}
    music_dir = os.path.join(root_dir, "music")
    for root, dirs, files in os.walk(music_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                utt = str(file).replace(".wav", "")
                utt2wav[utt] = file_path
            elif str(file) == "ANNOTATIONS":
                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
                utt2spk.update(utt2spk_part)
                utt2vocals.update(utt2vocals_part)
    utt2spk_str = ""
    utt2wav_str = ""
    for utt in utt2vocals:
        if use_vocals or not utt2vocals[utt]:
            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
            utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
    return utt2spk_str, utt2wav_str

def prepare_speech(root_dir):
    utt2spk = {}
    utt2wav = {}
    speech_dir = os.path.join(root_dir, "speech")
    for root, dirs, files in os.walk(speech_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                utt = str(file).replace(".wav", "")
                utt2wav[utt] = file_path
                utt2spk[utt] = utt
    utt2spk_str = ""
    utt2wav_str = ""
    for utt in utt2spk:
        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
        utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
    return utt2spk_str, utt2wav_str

def prepare_noise(root_dir):
    utt2spk = {}
    utt2wav = {}
    noise_dir = os.path.join(root_dir, "noise")
    for root, dirs, files in os.walk(noise_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                utt = str(file).replace(".wav", "")
                utt2wav[utt] = file_path
                utt2spk[utt] = utt
    utt2spk_str = ""
    utt2wav_str = ""
    for utt in utt2spk:
        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
        utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
    return utt2spk_str, utt2wav_str

def main():
    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    use_vocals = sys.argv[3] == "Y"
    utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
    wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
    wav_fi.write(utt2wav)
    wav_fi.close()
    utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
    utt2spk_fi.write(utt2spk)
    utt2spk_fi.close()


if __name__ == "__main__":
    main()
diff --git a/egs/sre16/v1/local/make_musan.sh b/egs/sre16/v1/local/make_musan.sh
new file mode 100755
index 000000000..1faac0ef5
--- /dev/null
+++ b/egs/sre16/v1/local/make_musan.sh
@@ -0,0 +1,37 @@
#!/bin/bash
# Copyright 2015   David Snyder
# Apache 2.0.
#
# This script, called by ../run.sh, creates the MUSAN
# data directory.  The required dataset is freely available at
# http://www.openslr.org/17/

set -e
in_dir=$1
data_dir=$2
use_vocals='Y'

rm -rf local/musan.tmp
mkdir local/musan.tmp

echo "Preparing ${data_dir}/musan..."
mkdir -p ${data_dir}/musan
local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
utils/fix_data_dir.sh ${data_dir}/musan

grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
  ${data_dir}/musan ${data_dir}/musan_music
utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
  ${data_dir}/musan ${data_dir}/musan_speech
utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
  ${data_dir}/musan ${data_dir}/musan_noise

utils/fix_data_dir.sh ${data_dir}/musan_music
utils/fix_data_dir.sh ${data_dir}/musan_speech
utils/fix_data_dir.sh ${data_dir}/musan_noise

rm -rf local/musan.tmp
diff --git a/egs/sre16/v1/local/make_mx6.sh b/egs/sre16/v1/local/make_mx6.sh
new file mode 100755
index 000000000..4e0df1350
--- /dev/null
+++ b/egs/sre16/v1/local/make_mx6.sh
@@ -0,0 +1,41 @@
#!/bin/bash
# Copyright 2017   David Snyder
# Apache 2.0.
#
# This script prepares both the microphone and telephone portions of the
# Mixer 6 corpus.
if [ $# -ne 2 ]; then
  echo "Usage: $0 <mixer6-speech> <out-dir>"
  echo "e.g.: $0 /export/corpora/LDC/LDC2013S03 data/"
  exit 1;
fi

set -e
in_dir=$1
out_dir=$2

# Mic 01 is the lapel mic for the interviewer, so we don't use it.  Mic 02 is
# the lapel mic for the interviewee.  All other mics are placed throughout the
# room.  In addition to mic 01, we omit mics 03 and 14 as they are often
# silent.
echo "$0: preparing mic speech (excluding 01, 03, and 14)"

for mic in 02 04 05 06 07 08 09 10 11 12 13; do
  local/make_mx6_mic.pl $in_dir $mic $out_dir
done

utils/combine_data.sh $out_dir/mx6_mic_04_to_13 $out_dir/mx6_mic_{04,05,06,07,08,09,10,11,12,13}

# Mics 02-13 contain the same content, but recorded from different microphones.
# To get some channel diversity without being overwhelmed with duplicated
# data, we take a 2k subset from mics 04-13 and combine it with all of mic 02.
echo "$0: selecting a 2k subset of mics 04 through 13 and combining it with mic 02"
utils/subset_data_dir.sh $out_dir/mx6_mic_04_to_13 2000 $out_dir/mx6_mic_04_to_13_2k
utils/combine_data.sh $out_dir/mx6_mic $out_dir/mx6_mic_02 $out_dir/mx6_mic_04_to_13_2k

echo "$0: preparing telephone portion"
local/make_mx6_calls.pl $in_dir $out_dir

echo "$0: combining mic and telephone speech in data/mx6"
utils/combine_data.sh $out_dir/mx6 $out_dir/mx6_mic $out_dir/mx6_calls
utils/fix_data_dir.sh $out_dir/mx6
diff --git a/egs/sre16/v1/local/make_mx6_calls.pl b/egs/sre16/v1/local/make_mx6_calls.pl
new file mode 100755
index 000000000..ed9d63752
--- /dev/null
+++ b/egs/sre16/v1/local/make_mx6_calls.pl
@@ -0,0 +1,105 @@
#!/usr/bin/perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2017   David Snyder
# Apache 2.0
#
# Prepares the telephone portion of Mixer 6 (LDC2013S03).

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-LDC2013S03> <path-to-output>\n";
  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 data/\n";
  exit(1);
}
($db_base, $out_dir) = @ARGV;

if (! -d "$db_base/mx6_speech/data/ulaw_sphere/") {
  print STDERR "Directory $db_base/mx6_speech/data/ulaw_sphere/ doesn't exist\n";
  exit(1);
}

$out_dir = "$out_dir/mx6_calls";

$tmp_dir = "$out_dir/tmp";
if (system("mkdir -p $tmp_dir") != 0) {
  die "Error making directory $tmp_dir";
}

if (system("mkdir -p $out_dir") != 0) {
  print STDERR "Error making directory $out_dir\n";
  exit(1);
}

%call2sph = ();
open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_subjs.csv";
open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
open(META, "<$db_base/mx6_speech/docs/mx6_calls.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_calls.csv";

if (system("find $db_base/mx6_speech/data/ulaw_sphere/ -name '*.sph' > $tmp_dir/sph.list") != 0) {
  die "Error getting list of sph files";
}
open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";

while(<SPHLIST>) {
  chomp;
  $sph = $_;
  @toks = split("/",$sph);
  $sph_id = (split("[./]",$toks[$#toks]))[0];
  $call_id = (split("_", $sph_id))[2];
  $call2sph{$call_id} = $sph;
}

while (<SUBJECTS>) {
  chomp;
  $line = $_;
  @toks = split(",", $line);
  $spk = $toks[0];
  $gender = lc $toks[1];
  if ($gender eq "f" or $gender eq "m") {
    print GNDR "$spk $gender\n";
  }
}

$num_good_files = 0;
$num_bad_files = 0;
while (<META>) {
  chomp;
  $line = $_;
  @toks = split(",", $line);
  $call_id = $toks[0];
  ($call_date, $call_time) = split(/_/, $toks[1]);
  $sid_A = $toks[4];
  $sid_B = $toks[12];
  if (exists $call2sph{$call_id} and -f $call2sph{$call_id}) {
    $utt_A = "${sid_A}_MX6_${call_id}_A";
    $utt_B = "${sid_B}_MX6_${call_id}_B";
    print SPKR "${utt_A} $sid_A\n";
    print SPKR "${utt_B} $sid_B\n";
    print WAV "${utt_A} sph2pipe -f wav -p -c 1 $call2sph{$call_id} |\n";
    print WAV "${utt_B} sph2pipe -f wav -p -c 2 $call2sph{$call_id} |\n";
    $num_good_files++;
  } else {
    print STDERR "Sphere file for $call_id doesn't exist\n";
    $num_bad_files++;
  }
}

print STDERR "Processed $num_good_files utterances; $num_bad_files had missing sphere data.\n";

close(SPHLIST) || die;
close(SUBJECTS) || die;
close(GNDR) || die;
close(SPKR) || die;
close(WAV) || die;
close(META) || die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}

system("utils/fix_data_dir.sh $out_dir");
if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
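The wav.scp entries written above follow a simple convention: side A of the
two-channel SPHERE file is extracted with sph2pipe channel 1, side B with
channel 2.  A hypothetical Python sketch of the same line construction
(wav_scp_line is an illustrative helper, not part of this commit):

#!/usr/bin/env python3
# Illustrative sketch: build one Kaldi wav.scp entry for one side of a
# two-channel SPHERE call, mirroring the prints in make_mx6_calls.pl.
def wav_scp_line(sid, call_id, side, sph_path):
    channel = {"A": 1, "B": 2}[side]  # side A -> channel 1, side B -> channel 2
    utt = "{0}_MX6_{1}_{2}".format(sid, call_id, side)
    return "{0} sph2pipe -f wav -p -c {1} {2} |".format(utt, channel, sph_path)

# e.g. wav_scp_line("1001", "4097", "A", "/data/mx6/4097.sph")
# -> "1001_MX6_4097_A sph2pipe -f wav -p -c 1 /data/mx6/4097.sph |"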
diff --git a/egs/sre16/v1/local/make_mx6_mic.pl b/egs/sre16/v1/local/make_mx6_mic.pl
new file mode 100755
index 000000000..7e1b4046e
--- /dev/null
+++ b/egs/sre16/v1/local/make_mx6_mic.pl
@@ -0,0 +1,92 @@
#!/usr/bin/perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2017   David Snyder
# Apache 2.0
# Prepares Mixer 6 (LDC2013S03) speech from a specified microphone and
# downsamples it to 8k.

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-LDC2013S03> <channel> <path-to-output>\n";
  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2013S03 02 data/\n";
  exit(1);
}
($db_base, $ch, $out_dir) = @ARGV;

@bad_channels = ("01", "03", "14");
if (grep { $_ eq $ch } @bad_channels) {
  print STDERR "Bad channel $ch\n";
  exit(1);
}

if (! -d "$db_base/mx6_speech/data/pcm_flac/CH$ch/") {
  print STDERR "Directory $db_base/mx6_speech/data/pcm_flac/CH$ch/ doesn't exist\n";
  exit(1);
}

$out_dir = "$out_dir/mx6_mic_$ch";
if (system("mkdir -p $out_dir") != 0) {
  print STDERR "Error making directory $out_dir\n";
  exit(1);
}

open(SUBJECTS, "<$db_base/mx6_speech/docs/mx6_subjs.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_subjs.csv";
open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
open(META, "<$db_base/mx6_speech/docs/mx6_ivcomponents.csv") || die "cannot open $db_base/mx6_speech/docs/mx6_ivcomponents.csv";

while (<SUBJECTS>) {
  chomp;
  $line = $_;
  @toks = split(",", $line);
  $spk = $toks[0];
  $gender = lc $toks[1];
  if ($gender eq "f" or $gender eq "m") {
    print GNDR "$spk $gender\n";
  }
}

$num_good_files = 0;
$num_bad_files = 0;
while (<META>) {
  chomp;
  $line = $_;
  @toks = split(",", $line);
  $flac = "$db_base/mx6_speech/data/pcm_flac/CH$ch/$toks[0]_CH$ch.flac";
  $t1 = $toks[7];
  $t2 = $toks[8];
  @toks2 = split(/_/, $toks[0]);
  $spk = $toks2[3];
  $utt = "${spk}_MX6_$toks2[0]_$toks2[1]_$ch";
  if (-f $flac) {
    print SPKR "${utt} $spk\n";
    print WAV "${utt} sox -t flac $flac -r 8k -t wav - trim $t1 =$t2 |\n";
    $num_good_files++;
  } else {
    print STDERR "File $flac doesn't exist\n";
    $num_bad_files++;
  }
}

print STDERR "Processed $num_good_files utterances; $num_bad_files had missing flac data.\n";

close(SUBJECTS) || die;
close(GNDR) || die;
close(SPKR) || die;
close(WAV) || die;
close(META) || die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}

system("utils/fix_data_dir.sh $out_dir");
if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
diff --git a/egs/sre16/v1/local/make_sre.pl b/egs/sre16/v1/local/make_sre.pl
new file mode 100755
index 000000000..d6e1abf94
--- /dev/null
+++ b/egs/sre16/v1/local/make_sre.pl
@@ -0,0 +1,75 @@
#!/usr/bin/perl
use warnings; #sed replacement for -w perl parameter
#
# Copyright 2015   David Snyder
# Apache 2.0.
# Usage: make_sre.pl <path-to-data> <name-of-source> <sre-ref> <output-dir>

if (@ARGV != 4) {
  print STDERR "Usage: $0 <path-to-data> <name-of-source> <sre-ref> <output-dir>\n";
  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2006S44 sre2004 sre_ref data/sre2004\n";
  exit(1);
}

($db_base, $sre_year, $sre_ref_filename, $out_dir) = @ARGV;
%utt2sph = ();
%spk2gender = ();

$tmp_dir = "$out_dir/tmp";
if (system("mkdir -p $tmp_dir") != 0) {
  die "Error making directory $tmp_dir";
}

if (system("find -L $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
  die "Error getting list of sph files";
}
open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";

while(<WAVLIST>) {
  chomp;
  $sph = $_;
  @A1 = split("/",$sph);
  @A2 = split("[./]",$A1[$#A1]);
  $uttId = $A2[0];
  $utt2sph{$uttId} = $sph;
}

open(GNDR,">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
open(SPKR,">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV,">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(SRE_REF, "<$sre_ref_filename") or die "Cannot open SRE reference.";
while (<SRE_REF>) {
  chomp;
  ($speaker, $gender, $other_sre_year, $utt_id, $channel) = split(" ", $_);
  $channel_num = "1";
  if ($channel eq "A") {
    $channel_num = "1";
  } else {
    $channel_num = "2";
  }
  $channel = lc $channel;
  if (($other_sre_year eq "sre20$sre_year") and (exists $utt2sph{$utt_id})) {
    $full_utt_id = "$speaker-sre$sre_year-$utt_id-$channel";
    $spk2gender{"$speaker"} = $gender;
    print WAV "$full_utt_id"," sph2pipe -f wav -p -c $channel_num $utt2sph{$utt_id} |\n";
    print SPKR "$full_utt_id $speaker","\n";
  }
}
foreach $speaker (keys %spk2gender) {
  print GNDR "$speaker $spk2gender{$speaker}\n";
}

close(GNDR) || die;
close(SPKR) || die;
close(WAV) || die;
close(SRE_REF) || die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}

system("utils/fix_data_dir.sh $out_dir");
if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
diff --git a/egs/sre16/v1/local/make_sre.sh b/egs/sre16/v1/local/make_sre.sh
new file mode 100755
index 000000000..45e75ac8a
--- /dev/null
+++ b/egs/sre16/v1/local/make_sre.sh
@@ -0,0 +1,34 @@
#!/bin/bash
# Copyright 2017   David Snyder
# Apache 2.0.
#
# See README.txt for more info on the required data.

set -e

data_root=$1
data_dir=$2

wget -P data/local/ http://www.openslr.org/resources/15/speaker_list.tgz
tar -C data/local/ -xvf data/local/speaker_list.tgz
sre_ref=data/local/speaker_list

local/make_sre.pl $data_root/LDC2006S44/ \
  04 $sre_ref $data_dir/sre2004

local/make_sre.pl $data_root/LDC2011S01 \
  05 $sre_ref $data_dir/sre2005_train

local/make_sre.pl $data_root/LDC2011S04 \
  05 $sre_ref $data_dir/sre2005_test

local/make_sre.pl $data_root/LDC2011S09 \
  06 $sre_ref $data_dir/sre2006_train

local/make_sre.pl $data_root/LDC2011S10 \
  06 $sre_ref $data_dir/sre2006_test_1

local/make_sre.pl $data_root/LDC2012S01 \
  06 $sre_ref $data_dir/sre2006_test_2

rm data/local/speaker_list.*
diff --git a/egs/sre16/v1/local/make_sre08.pl b/egs/sre16/v1/local/make_sre08.pl
new file mode 100755
index 000000000..e68cc4926
--- /dev/null
+++ b/egs/sre16/v1/local/make_sre08.pl
@@ -0,0 +1,131 @@
#!/usr/bin/perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2017   David Snyder
# Apache 2.0
#
# This script prepares SRE08 test (LDC2011S08) and SRE08 enroll (LDC2011S05)
# simultaneously in a single data directory.

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-SRE08-test> <path-to-SRE08-train> <path-to-output>\n";
  print STDERR "e.g. $0 /export/corpora/LDC/LDC2011S08 /export/corpora/LDC/LDC2011S05 data/\n";
  exit(1);
}
($db_base_test, $db_base_train, $out_dir) = @ARGV;

if (! -d "$db_base_test/data/") {
  print STDERR "Directory $db_base_test/data/ doesn't exist\n";
  exit(1);
}

if (! -d "$db_base_train/data/") {
  print STDERR "Directory $db_base_train/data/ doesn't exist\n";
  exit(1);
}

$out_dir = "$out_dir/sre08";
$tmp_dir = "$out_dir/tmp";
if (system("mkdir -p $tmp_dir") != 0) {
  die "Error making directory $tmp_dir";
}

if (system("mkdir -p $out_dir") != 0) {
  print STDERR "Error making directory $out_dir\n";
  exit(1);
}

%seg2sph = ();
open(TRIALS, "<$db_base_test/data/keys/NIST_SRE08_KEYS.v0.1/trial-keys/NIST_SRE08_short2-short3.trial.key") || die "Could not open $db_base_test/data/keys/NIST_SRE08_KEYS.v0.1/trial-keys/NIST_SRE08_short2-short3.trial.key";
open(MODELS, "<$db_base_test/data/keys/NIST_SRE08_KEYS.v0.1/model-keys/NIST_SRE08_short2.model.key") || die "Could not open $db_base_test/data/keys/NIST_SRE08_KEYS.v0.1/model-keys/NIST_SRE08_short2.model.key";
open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";

if (system("find $db_base_test/data/ -name '*.sph' > $tmp_dir/sph.list") != 0) {
  die "Error getting list of sph files for $db_base_test";
}
if (system("find $db_base_train/data/ -name '*.sph' >> $tmp_dir/sph.list") != 0) {
  die "Error getting list of sph files for $db_base_train";
}

open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
while(<SPHLIST>) {
  chomp;
  $sph = $_;
  @toks = split("/",$sph);
  $sph_id = (split("[./]",$toks[$#toks]))[0];
  $seg2sph{$sph_id} = $sph;
}

%model2sid = ();
while (<MODELS>) {
  chomp;
  $line = $_;
  @toks = split(",", $line);
  $model = $toks[0];
  $gender = $toks[1];
  ($seg, $ch) = split("[:]", $toks[2]);
  $sid = $toks[3];
  $model2sid{$model} = $sid;
  print GNDR "$sid $gender\n";
  if (exists $seg2sph{$seg} and -f $seg2sph{$seg}) {
    $sph = $seg2sph{$seg};
    if ($ch eq "a") {
      $utt = "${sid}_SRE08_${seg}_A";
      print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
      print SPKR "$utt $sid\n";
    } elsif($ch eq "b") {
      $utt = "${sid}_SRE08_${seg}_B";
      print WAV "$utt"," sph2pipe -f wav -p -c 2 $sph |\n";
      print SPKR "$utt $sid\n";
    } else {
      print STDERR "Malformed trials file\n";
      exit(1);
    }
  }
}

while (<TRIALS>) {
  chomp;
  $line = $_;
  @toks = split(",", $line);
  $model = $toks[0];
  $seg = $toks[1];
  $ch = $toks[2];
  $target = $toks[3];
  if (exists $seg2sph{$seg} and -f $seg2sph{$seg}) {
    $sph = $seg2sph{$seg};
    if ($target eq "target" and exists $model2sid{$model}) {
      $sid = $model2sid{$model};
      if ($ch eq "a") {
        $utt = "${sid}_SRE08_${seg}_A";
        print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
        print SPKR "$utt $sid\n";
      } elsif($ch eq "b") {
        $utt = "${sid}_SRE08_${seg}_B";
        print WAV "$utt"," sph2pipe -f wav -p -c 2 $sph |\n";
        print SPKR "$utt $sid\n";
      } else {
        print STDERR "Malformed trials file\n";
        exit(1);
      }
    }
  }
}

close(TRIALS) || die;
close(MODELS) || die;
close(GNDR) || die;
close(SPKR) || die;
close(WAV) || die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}

system("utils/fix_data_dir.sh $out_dir");
if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
diff --git a/egs/sre16/v1/local/make_sre10.pl b/egs/sre16/v1/local/make_sre10.pl
new file mode 100755
index 000000000..eba9f6977
--- /dev/null
+++ b/egs/sre16/v1/local/make_sre10.pl
@@ -0,0 +1,133 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3# Copyright 2017 David Snyder
4# Apache 2.0
5#
6# Prepares NIST SRE10 enroll and test data in a single directory.
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-SRE10-eval> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora5/SRE/SRE2010/eval/ data/\n";
10 exit(1);
11}
12($db_base, $out_dir) = @ARGV;
13
14if (! -d "$db_base/data/") {
15 print STDERR "Directory $db_base/data/ doesn't exist\n";
16 exit(1);
17}
18$out_dir = "$out_dir/sre10";
19$tmp_dir = "$out_dir/tmp";
20if (system("mkdir -p $tmp_dir") != 0) {
21 die "Error making directory $tmp_dir";
22}
23
24if (system("mkdir -p $out_dir") != 0) {
25 print STDERR "Error making directory $out_dir\n";
26 exit(1);
27}
28
29%seg2sph = ();
30open(TRIALS, "<$db_base/keys/coreext-coreext.trialkey.csv") || die "Could not open $db_base/keys/coreext-coreext.trialkey.csv";
31open(TRAIN, "<$db_base/train/coreext.trn") || die "Could not open $db_base/train/coreext.trn";
32open(MODELS, "<$db_base/keys/coreext.modelkey.csv") || die "Could not open $db_base/keys/coreext.modelkey.csv";
33open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
34open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
35open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
36
37if (system("find $db_base/data/ -name '*.sph' > $tmp_dir/sph.list") != 0) {
38 die "Error getting list of sph files";
39}
40open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
41while(<SPHLIST>) {
42 chomp;
43 $sph = $_;
44 @toks = split("/",$sph);
45 $sph_id = (split("[./]",$toks[$#toks]))[0];
46 $seg2sph{$sph_id} = $sph;
47}
48
49%model2sid = ();
50while (<MODELS>) {
51 chomp;
52 $line = $_;
53 ($model, $sid) = split(",", $line);
54 if (not $sid eq "NOT_SCORED") {
55 $model2sid{$model} = $sid;
56 }
57}
58
59while (<TRAIN>) {
60 chomp;
61 $line = $_;
62 @toks = split(" ", $line);
63 $model = $toks[0];
64 $gender = $toks[1];
65 @toks2 = split("/", $toks[2]);
66 ($sph, $ch) = split("[:]", $toks2[$#toks2]);
67 $seg = (split("[./]", $sph))[0];
68 if (exists $seg2sph{$seg}) {
69 $sph = $seg2sph{$seg};
70 if (exists $model2sid{$model}) {
71 $sid = $model2sid{$model};
72 print GNDR "$sid $gender\n";
73 if ($ch eq "A") {
74 $utt = "${sid}_SRE10_${seg}_A";
75 print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
76 print SPKR "$utt $sid\n";
77 } elsif($ch eq "B") {
78 $utt = "${sid}_SRE10_${seg}_B";
79 print WAV "$utt"," sph2pipe -f wav -p -c 2 $sph |\n";
80 print SPKR "$utt $sid\n";
81 } else {
82 print STDERR "Malformed train file\n";
83 exit(1);
84 }
85 }
86 }
87}
88
89while (<TRIALS>) {
90 chomp;
91 $line = $_;
92 @toks = split(",", $line);
93 $model = $toks[0];
94 $seg = $toks[1];
95 $ch = $toks[2];
96 $target = $toks[3];
97 if (exists $seg2sph{$seg} and -f $seg2sph{$seg}) {
98 $sph = $seg2sph{$seg};
99 if ($target eq "target" and exists $model2sid{$model}) {
100 $sid = $model2sid{$model};
101 if ($ch eq "a") {
102 $utt = "${sid}_SRE10_${seg}_A";
103 print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
104 print SPKR "$utt $sid\n";
105 } elsif($ch eq "b") {
106 $utt = "${sid}_SRE10_${seg}_B";
107 print WAV "$utt"," sph2pipe -f wav -p -c 2 $sph |\n";
108 print SPKR "$utt $sid\n";
109 } else {
110 print STDERR "Malformed trials file\n";
111 exit(1);
112 }
113 }
114 }
115}
116
117close(TRIALS) || die;
118close(TRAIN) || die;
119close(MODELS) || die;
120close(GNDR) || die;
121close(SPKR) || die;
122close(WAV) || die;
123
124if (system(
125 "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
126 die "Error creating spk2utt file in directory $out_dir";
127}
128
129system("utils/fix_data_dir.sh $out_dir");
130if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
131 die "Error validating directory $out_dir";
132}
133
diff --git a/egs/sre16/v1/local/make_sre16_eval.pl b/egs/sre16/v1/local/make_sre16_eval.pl
new file mode 100755
index 000000000..9817a8513
--- /dev/null
+++ b/egs/sre16/v1/local/make_sre16_eval.pl
@@ -0,0 +1,154 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3# Copyright 2017 David Snyder
4# Apache 2.0
5#
6
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-SRE16-eval> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora/SRE/R149_0_1 data/\n";
10 exit(1);
11}
12
13($db_base, $out_dir) = @ARGV;
14
15# Handle enroll
16$out_dir_enroll = "$out_dir/sre16_eval_enroll";
17if (system("mkdir -p $out_dir_enroll")) {
18 die "Error making directory $out_dir_enroll";
19}
20
21$tmp_dir_enroll = "$out_dir_enroll/tmp";
22if (system("mkdir -p $tmp_dir_enroll") != 0) {
23 die "Error making directory $tmp_dir_enroll";
24}
25
26open(SPKR, ">$out_dir_enroll/utt2spk") || die "Could not open the output file $out_dir_enroll/utt2spk";
27open(WAV, ">$out_dir_enroll/wav.scp") || die "Could not open the output file $out_dir_enroll/wav.scp";
28open(META, "<$db_base/docs/sre16_eval_enrollment.tsv") or die "cannot open wav list";
29%utt2fixedutt = ();
30while (<META>) {
31 $line = $_;
32 @toks = split(" ", $line);
33 $spk = $toks[0];
34 $utt = $toks[1];
35 if ($utt ne "segment") {
36 print SPKR "${spk}-${utt} $spk\n";
37 $utt2fixedutt{$utt} = "${spk}-${utt}";
38 }
39}
40
41if (system("find $db_base/data/enrollment/ -name '*.sph' > $tmp_dir_enroll/sph.list") != 0) {
42 die "Error getting list of sph files";
43}
44
45open(WAVLIST, "<$tmp_dir_enroll/sph.list") or die "cannot open wav list";
46
47while(<WAVLIST>) {
48 chomp;
49 $sph = $_;
50 @t = split("/",$sph);
51 @t1 = split("[./]",$t[$#t]);
52 $utt=$utt2fixedutt{$t1[0]};
53 print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
54}
55close(WAV) || die;
56close(SPKR) || die;
57
58# Handle test
59$out_dir_test= "$out_dir/sre16_eval_test";
60if (system("mkdir -p $out_dir_test")) {
61 die "Error making directory $out_dir_test";
62}
63
64$tmp_dir_test = "$out_dir_test/tmp";
65if (system("mkdir -p $tmp_dir_test") != 0) {
66 die "Error making directory $tmp_dir_test";
67}
68
69open(SPKR, ">$out_dir_test/utt2spk") || die "Could not open the output file $out_dir_test/utt2spk";
70open(WAV, ">$out_dir_test/wav.scp") || die "Could not open the output file $out_dir_test/wav.scp";
71open(TRIALS, ">$out_dir_test/trials") || die "Could not open the output file $out_dir_test/trials";
72open(TGL_TRIALS, ">$out_dir_test/trials_tgl") || die "Could not open the output file $out_dir_test/trials_tgl";
73open(YUE_TRIALS, ">$out_dir_test/trials_yue") || die "Could not open the output file $out_dir_test/trials_yue";
74
75if (system("find $db_base/data/test/ -name '*.sph' > $tmp_dir_test/sph.list") != 0) {
76 die "Error getting list of sph files";
77}
78
79open(KEY, "<$db_base/docs/sre16_eval_trial_key.tsv") || die "Could not open trials file $db_base/docs/sre16_eval_trial_key.tsv. It might be located somewhere else in your distribution.";
80open(SEG_KEY, "<$db_base/docs/sre16_eval_segment_key.tsv") || die "Could not open trials file $db_base/docs/sre16_eval_segment_key.tsv. It might be located somewhere else in your distribution.";
81open(LANG_KEY, "<$db_base/metadata/calls.tsv") || die "Could not open trials file $db_base/metadata/calls.tsv. It might be located somewhere else in your distribution.";
82open(WAVLIST, "<$tmp_dir_test/sph.list") or die "cannot open wav list";
83
84%utt2call = ();
85while(<SEG_KEY>) {
86 chomp;
87 $line = $_;
88 @toks = split(" ", $line);
89 $utt = $toks[0];
90 $call = $toks[1];
91 if ($utt ne "segment") {
92 $utt2call{$utt} = $call;
93 }
94}
95close(SEG_KEY) || die;
96
97%call2lang = ();
98while(<LANG_KEY>) {
99 chomp;
100 $line = $_;
101 @toks = split(" ", $line);
102 $call = $toks[0];
103 $lang = $toks[1];
104 $call2lang{$call} = $lang;
105}
106close(LANG_KEY) || die;
107
108while(<WAVLIST>) {
109 chomp;
110 $sph = $_;
111 @t = split("/",$sph);
112 @t1 = split("[./]",$t[$#t]);
113 $utt=$t1[0];
114 print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
115 print SPKR "$utt $utt\n";
116}
117close(WAV) || die;
118close(SPKR) || die;
119
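# Each line of the trials files written below has the form
# "<enroll-speaker> <test-utterance> <target|nontarget>"; trials_tgl and
# trials_yue hold the Tagalog and Cantonese subsets of the pooled trials.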
120while (<KEY>) {
121 $line = $_;
122 @toks = split(" ", $line);
123 $spk = $toks[0];
124 $utt = $toks[1];
125 $call = $utt2call{$utt};
126 $target_type = $toks[3];
127 if ($utt ne "segment") {
128 print TRIALS "${spk} ${utt} ${target_type}\n";
129 if ($call2lang{$call} eq "tgl") {
130 print TGL_TRIALS "${spk} ${utt} ${target_type}\n";
131 } elsif ($call2lang{$call} eq "yue") {
132 print YUE_TRIALS "${spk} ${utt} ${target_type}\n";
133 } else {
134 die "Unexpected language $call2lang{$call} for utterance $utt.";
135 }
136 }
137}
138
139close(TRIALS) || die;
140close(TGL_TRIALS) || die;
141close(YUE_TRIALS) || die;
142
143if (system("utils/utt2spk_to_spk2utt.pl $out_dir_enroll/utt2spk >$out_dir_enroll/spk2utt") != 0) {
144 die "Error creating spk2utt file in directory $out_dir_enroll";
145}
146if (system("utils/utt2spk_to_spk2utt.pl $out_dir_test/utt2spk >$out_dir_test/spk2utt") != 0) {
147 die "Error creating spk2utt file in directory $out_dir_test";
148}
149if (system("utils/fix_data_dir.sh $out_dir_enroll") != 0) {
150 die "Error fixing data dir $out_dir_enroll";
151}
152if (system("utils/fix_data_dir.sh $out_dir_test") != 0) {
153 die "Error fixing data dir $out_dir_test";
154}
diff --git a/egs/sre16/v1/local/make_sre16_unlabeled.pl b/egs/sre16/v1/local/make_sre16_unlabeled.pl
new file mode 100755
index 000000000..2de9d14ab
--- /dev/null
+++ b/egs/sre16/v1/local/make_sre16_unlabeled.pl
@@ -0,0 +1,90 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3# Copyright 2017 David Snyder
4# Apache 2.0
5
6if (@ARGV != 2) {
7 print STDERR "Usage: $0 <path-to-call-my-net-training-data> <path-to-output>\n";
8 print STDERR "e.g. $0 /export/corpora/SRE/LDC2016E46_SRE16_Call_My_Net_Training_Data data/\n";
9 exit(1);
10}
11
12($db_base, $out_dir) = @ARGV;
13
14# Handle major subset.
15$out_dir_major = "$out_dir/sre16_major";
16if (system("mkdir -p $out_dir_major")) {
17 die "Error making directory $out_dir_major";
18}
19
20$tmp_dir_major = "$out_dir_major/tmp";
21if (system("mkdir -p $tmp_dir_major") != 0) {
22 die "Error making directory $tmp_dir_major";
23}
24
25open(SPKR, ">$out_dir_major/utt2spk") || die "Could not open the output file $out_dir_major/utt2spk";
26open(WAV, ">$out_dir_major/wav.scp") || die "Could not open the output file $out_dir_major/wav.scp";
27
28if (system("find $db_base/data/unlabeled/major/ -name '*.sph' > $tmp_dir_major/sph.list") != 0) {
29 die "Error getting list of sph files";
30}
31
32open(WAVLIST, "<$tmp_dir_major/sph.list") or die "cannot open wav list";
33
34while(<WAVLIST>) {
35 chomp;
36 $sph = $_;
37 @t = split("/",$sph);
38 @t1 = split("[./]",$t[$#t]);
39 $utt=$t1[0];
40 print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
41 print SPKR "$utt $utt\n";
42}
43
44close(WAV) || die;
45close(SPKR) || die;
46
47# Handle minor subset.
48$out_dir_minor= "$out_dir/sre16_minor";
49if (system("mkdir -p $out_dir_minor")) {
50 die "Error making directory $out_dir_minor";
51}
52
53$tmp_dir_minor = "$out_dir_minor/tmp";
54if (system("mkdir -p $tmp_dir_minor") != 0) {
55 die "Error making directory $tmp_dir_minor";
56}
57
58open(SPKR, ">$out_dir_minor/utt2spk") || die "Could not open the output file $out_dir_minor/utt2spk";
59open(WAV, ">$out_dir_minor/wav.scp") || die "Could not open the output file $out_dir_minor/wav.scp";
60
61if (system("find $db_base/data/unlabeled/minor/ -name '*.sph' > $tmp_dir_minor/sph.list") != 0) {
62 die "Error getting list of sph files";
63}
64
65open(WAVLIST, "<$tmp_dir_minor/sph.list") or die "cannot open wav list";
66
67while(<WAVLIST>) {
68 chomp;
69 $sph = $_;
70 @t = split("/",$sph);
71 @t1 = split("[./]",$t[$#t]);
72 $utt=$t1[0];
73 print WAV "$utt"," sph2pipe -f wav -p -c 1 $sph |\n";
74 print SPKR "$utt $utt\n";
75}
76close(WAV) || die;
77close(SPKR) || die;
78
79if (system("utils/utt2spk_to_spk2utt.pl $out_dir_major/utt2spk >$out_dir_major/spk2utt") != 0) {
80 die "Error creating spk2utt file in directory $out_dir_major";
81}
82if (system("utils/utt2spk_to_spk2utt.pl $out_dir_minor/utt2spk >$out_dir_minor/spk2utt") != 0) {
83 die "Error creating spk2utt file in directory $out_dir_minor";
84}
85if (system("utils/fix_data_dir.sh $out_dir_major") != 0) {
86 die "Error fixing data dir $out_dir_major";
87}
88if (system("utils/fix_data_dir.sh $out_dir_minor") != 0) {
89 die "Error fixing data dir $out_dir_minor";
90}
diff --git a/egs/sre16/v1/local/make_swbd2_phase1.pl b/egs/sre16/v1/local/make_swbd2_phase1.pl
new file mode 100755
index 000000000..71b26b55d
--- /dev/null
+++ b/egs/sre16/v1/local/make_swbd2_phase1.pl
@@ -0,0 +1,106 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3#
4# Copyright 2017 David Snyder
5# Apache 2.0
6
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-LDC98S75> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n";
10 exit(1);
11}
12($db_base, $out_dir) = @ARGV;
13
14if (system("mkdir -p $out_dir")) {
15 die "Error making directory $out_dir";
16}
17
18open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl";
19open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
20open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
21open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
22
23@badAudio = ("3", "4");
24
25$tmp_dir = "$out_dir/tmp";
26if (system("mkdir -p $tmp_dir") != 0) {
27 die "Error making directory $tmp_dir";
28}
29
30if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
31 die "Error getting list of sph files";
32}
33
34open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
35
36%wavs = ();
37while(<WAVLIST>) {
38 chomp;
39 $sph = $_;
40 @t = split("/",$sph);
41 @t1 = split("[./]",$t[$#t]);
42 $uttId = $t1[0];
43 $wavs{$uttId} = $sph;
44}
45
46while (<CS>) {
47 $line = $_ ;
48 @A = split(",", $line);
49 @A1 = split("[./]",$A[0]);
50 $wav = $A1[0];
51 if (grep { $_ eq $wav } @badAudio) {
52 # skip bad audio; just log it
53 print STDERR "Bad Audio = $wav\n";
54 } else {
55 $spkr1= "sw_" . $A[2];
56 $spkr2= "sw_" . $A[3];
57 $gender1 = $A[5];
58 $gender2 = $A[6];
59 if ($gender1 eq "M") {
60 $gender1 = "m";
61 } elsif ($gender1 eq "F") {
62 $gender1 = "f";
63 } else {
64 die "Unknown Gender in $line";
65 }
66 if ($gender2 eq "M") {
67 $gender2 = "m";
68 } elsif ($gender2 eq "F") {
69 $gender2 = "f";
70 } else {
71 die "Unknown Gender in $line";
72 }
73 if (-e "$wavs{$wav}") {
74 $uttId = $spkr1 ."_" . $wav ."_1";
75 if (!$spk2gender{$spkr1}) {
76 $spk2gender{$spkr1} = $gender1;
77 print GNDR "$spkr1"," $gender1\n";
78 }
79 print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n";
80 print SPKR "$uttId"," $spkr1","\n";
81
82 $uttId = $spkr2 . "_" . $wav ."_2";
83 if (!$spk2gender{$spkr2}) {
84 $spk2gender{$spkr2} = $gender2;
85 print GNDR "$spkr2"," $gender2\n";
86 }
87 print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n";
88 print SPKR "$uttId"," $spkr2","\n";
89 } else {
90 print STDERR "Missing $wavs{$wav} for $wav\n";
91 }
92 }
93}
94
95close(WAV) || die;
96close(SPKR) || die;
97close(GNDR) || die;
98if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
99 die "Error creating spk2utt file in directory $out_dir";
100}
101if (system("utils/fix_data_dir.sh $out_dir") != 0) {
102 die "Error fixing data dir $out_dir";
103}
104if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
105 die "Error validating directory $out_dir";
106}
diff --git a/egs/sre16/v1/local/make_swbd2_phase2.pl b/egs/sre16/v1/local/make_swbd2_phase2.pl
new file mode 100755
index 000000000..05b2b1fc7
--- /dev/null
+++ b/egs/sre16/v1/local/make_swbd2_phase2.pl
@@ -0,0 +1,107 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3#
4# Copyright 2013 Daniel Povey
5# Apache 2.0
6
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-LDC99S79> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora5/LDC/LDC99S79 data/swbd2_phase2_train\n";
10 exit(1);
11}
12($db_base, $out_dir) = @ARGV;
13
14if (system("mkdir -p $out_dir")) {
15 die "Error making directory $out_dir";
16}
17
18open(CS, "<$db_base/DISC1/doc/callstat.tbl") || die "Could not open $db_base/DISC1/doc/callstat.tbl";
19open(CI, "<$db_base/DISC1/doc/callinfo.tbl") || die "Could not open $db_base/DISC1/doc/callinfo.tbl";
20open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
21open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
22open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
23
24@badAudio = ("3", "4");
25
26$tmp_dir = "$out_base/tmp";
27if (system("mkdir -p $tmp_dir") != 0) {
28 die "Error making directory $tmp_dir";
29}
30
31if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
32 die "Error getting list of sph files";
33}
34
35open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
36
37while(<WAVLIST>) {
38 chomp;
39 $sph = $_;
40 @t = split("/",$sph);
41 @t1 = split("[./]",$t[$#t]);
42 $uttId=$t1[0];
43 $wav{$uttId} = $sph;
44}
45
46while (<CS>) {
47 $line = $_ ;
48 $ci = <CI>;
49 $ci = <CI>;
50 @ci = split(",",$ci);
51 $wav = $ci[0];
52 @A = split(",", $line);
53 if (grep { $_ eq $wav } @badAudio) {
54 # do nothing
55 } else {
56 $spkr1= "sw_" . $A[2];
57 $spkr2= "sw_" . $A[3];
58 $gender1 = $A[4];
59 $gender2 = $A[5];
60 if ($gender1 eq "M") {
61 $gender1 = "m";
62 } elsif ($gender1 eq "F") {
63 $gender1 = "f";
64 } else {
65 die "Unknown Gender in $line";
66 }
67 if ($gender2 eq "M") {
68 $gender2 = "m";
69 } elsif ($gender2 eq "F") {
70 $gender2 = "f";
71 } else {
72 die "Unknown Gender in $line";
73 }
74 if (-e "$wav{$wav}") {
75 $uttId = $spkr1 ."_" . $wav ."_1";
76 if (!$spk2gender{$spkr1}) {
77 $spk2gender{$spkr1} = $gender1;
78 print GNDR "$spkr1"," $gender1\n";
79 }
80 print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n";
81 print SPKR "$uttId"," $spkr1","\n";
82
83 $uttId = $spkr2 . "_" . $wav ."_2";
84 if (!$spk2gender{$spkr2}) {
85 $spk2gender{$spkr2} = $gender2;
86 print GNDR "$spkr2"," $gender2\n";
87 }
88 print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n";
89 print SPKR "$uttId"," $spkr2","\n";
90 } else {
91 print STDERR "Missing $wav{$wav} for $wav\n";
92 }
93 }
94}
95
96close(WAV) || die;
97close(SPKR) || die;
98close(GNDR) || die;
99if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
100 die "Error creating spk2utt file in directory $out_dir";
101}
102if (system("utils/fix_data_dir.sh $out_dir") != 0) {
103 die "Error fixing data dir $out_dir";
104}
105if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
106 die "Error validating directory $out_dir";
107}
diff --git a/egs/sre16/v1/local/make_swbd2_phase3.pl b/egs/sre16/v1/local/make_swbd2_phase3.pl
new file mode 100755
index 000000000..ca70df32e
--- /dev/null
+++ b/egs/sre16/v1/local/make_swbd2_phase3.pl
@@ -0,0 +1,102 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3#
4# Copyright 2013 Daniel Povey
5# Apache 2.0
6
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-LDC2002S06> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora5/LDC/LDC2002S06 data/swbd2_phase3_train\n";
10 exit(1);
11}
12($db_base, $out_dir) = @ARGV;
13
14if (system("mkdir -p $out_dir")) {
15 die "Error making directory $out_dir";
16}
17
18open(CS, "<$db_base/DISC1/docs/callstat.tbl") || die "Could not open $db_base/DISC1/docs/callstat.tbl";
19open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
20open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
21open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
22
23@badAudio = ("3", "4");
24
25$tmp_dir = "$out_base/tmp";
26if (system("mkdir -p $tmp_dir") != 0) {
27 die "Error making directory $tmp_dir";
28}
29
30if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) {
31 die "Error getting list of sph files";
32}
33
34open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list";
35while(<WAVLIST>) {
36 chomp;
37 $sph = $_;
38 @t = split("/",$sph);
39 @t1 = split("[./]",$t[$#t]);
40 $uttId=$t1[0];
41 $wav{$uttId} = $sph;
42}
43
44while (<CS>) {
45 $line = $_ ;
46 @A = split(",", $line);
47 $wav = "sw_" . $A[0] ;
48 if (grep { $_ eq $A[0] } @badAudio) {
49 # do nothing
50 } else {
51 $spkr1= "sw_" . $A[3];
52 $spkr2= "sw_" . $A[4];
53 $gender1 = $A[5];
54 $gender2 = $A[6];
55 if ($gender1 eq "M") {
56 $gender1 = "m";
57 } elsif ($gender1 eq "F") {
58 $gender1 = "f";
59 } else {
60 die "Unknown Gender in $line";
61 }
62 if ($gender2 eq "M") {
63 $gender2 = "m";
64 } elsif ($gender2 eq "F") {
65 $gender2 = "f";
66 } else {
67 die "Unknown Gender in $line";
68 }
69 if (-e "$wav{$wav}") {
70 $uttId = $spkr1 ."_" . $wav ."_1";
71 if (!$spk2gender{$spkr1}) {
72 $spk2gender{$spkr1} = $gender1;
73 print GNDR "$spkr1"," $gender1\n";
74 }
75 print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$wav} |\n";
76 print SPKR "$uttId"," $spkr1","\n";
77
78 $uttId = $spkr2 . "_" . $wav ."_2";
79 if (!$spk2gender{$spkr2}) {
80 $spk2gender{$spkr2} = $gender2;
81 print GNDR "$spkr2"," $gender2\n";
82 }
83 print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wav{$wav} |\n";
84 print SPKR "$uttId"," $spkr2","\n";
85 } else {
86 print STDERR "Missing $wav{$wav} for $wav\n";
87 }
88 }
89}
90
91close(WAV) || die;
92close(SPKR) || die;
93close(GNDR) || die;
94if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
95 die "Error creating spk2utt file in directory $out_dir";
96}
97if (system("utils/fix_data_dir.sh $out_dir") != 0) {
98 die "Error fixing data dir $out_dir";
99}
100if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
101 die "Error validating directory $out_dir";
102}
diff --git a/egs/sre16/v1/local/make_swbd_cellular1.pl b/egs/sre16/v1/local/make_swbd_cellular1.pl
new file mode 100755
index 000000000..e30c710e6
--- /dev/null
+++ b/egs/sre16/v1/local/make_swbd_cellular1.pl
@@ -0,0 +1,83 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3#
4# Copyright 2013 Daniel Povey
5# Apache 2.0
6
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-LDC2001S13> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora5/LDC/LDC2001S13 data/swbd_cellular1_train\n";
10 exit(1);
11}
12($db_base, $out_dir) = @ARGV;
13
14if (system("mkdir -p $out_dir")) {
15 die "Error making directory $out_dir";
16}
17
18open(CS, "<$db_base/doc/swb_callstats.tbl") || die "Could not open $db_base/doc/swb_callstats.tbl";
19open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
20open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
21open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
22
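# These conversation IDs are skipped below, presumably because of known
# audio problems in the corpus.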
23@badAudio = ("40019", "45024", "40022");
24
25while (<CS>) {
26 $line = $_ ;
27 @A = split(",", $line);
28 if (grep { $_ eq $A[0] } @badAudio) {
29 # do nothing
30 } else {
31 $wav = "sw_" . $A[0];
32 $spkr1= "sw_" . $A[1];
33 $spkr2= "sw_" . $A[2];
34 $gender1 = $A[3];
35 $gender2 = $A[4];
36 if ($A[3] eq "M") {
37 $gender1 = "m";
38 } elsif ($A[3] eq "F") {
39 $gender1 = "f";
40 } else {
41 die "Unknown Gender in $line";
42 }
43 if ($A[4] eq "M") {
44 $gender2 = "m";
45 } elsif ($A[4] eq "F") {
46 $gender2 = "f";
47 } else {
48 die "Unknown Gender in $line";
49 }
50 if (-e "$db_base/$wav.sph") {
51 $uttId = $spkr1 . "-swbdc_" . $wav ."_1";
52 if (!$spk2gender{$spkr1}) {
53 $spk2gender{$spkr1} = $gender1;
54 print GNDR "$spkr1"," $gender1\n";
55 }
56 print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/$wav.sph |\n";
57 print SPKR "$uttId"," $spkr1","\n";
58
59 $uttId = $spkr2 . "-swbdc_" . $wav ."_2";
60 if (!$spk2gender{$spkr2}) {
61 $spk2gender{$spkr2} = $gender2;
62 print GNDR "$spkr2"," $gender2\n";
63 }
64 print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/$wav.sph |\n";
65 print SPKR "$uttId"," $spkr2","\n";
66 } else {
67 print STDERR "Missing $db_base/$wav.sph\n";
68 }
69 }
70}
71
72close(WAV) || die;
73close(SPKR) || die;
74close(GNDR) || die;
75if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
76 die "Error creating spk2utt file in directory $out_dir";
77}
78if (system("utils/fix_data_dir.sh $out_dir") != 0) {
79 die "Error fixing data dir $out_dir";
80}
81if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
82 die "Error validating directory $out_dir";
83}
diff --git a/egs/sre16/v1/local/make_swbd_cellular2.pl b/egs/sre16/v1/local/make_swbd_cellular2.pl
new file mode 100755
index 000000000..4de954c19
--- /dev/null
+++ b/egs/sre16/v1/local/make_swbd_cellular2.pl
@@ -0,0 +1,83 @@
1#!/usr/bin/perl
2use warnings; #sed replacement for -w perl parameter
3#
4# Copyright 2013 Daniel Povey
5# Apache 2.0
6
7if (@ARGV != 2) {
8 print STDERR "Usage: $0 <path-to-LDC2004S07> <path-to-output>\n";
9 print STDERR "e.g. $0 /export/corpora5/LDC/LDC2004S07 data/swbd_cellular2_train\n";
10 exit(1);
11}
12($db_base, $out_dir) = @ARGV;
13
14if (system("mkdir -p $out_dir")) {
15 die "Error making directory $out_dir";
16}
17
18open(CS, "<$db_base/docs/swb_callstats.tbl") || die "Could not open $db_base/docs/swb_callstats.tbl";
19open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender";
20open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk";
21open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp";
22
23@badAudio=("45024", "40022");
24
25while (<CS>) {
26 $line = $_ ;
27 @A = split(",", $line);
28 if (grep { $_ eq $A[0] } @badAudio) {
29 # do nothing
30 } else {
31 $wav = "sw_" . $A[0];
32 $spkr1= "sw_" . $A[1];
33 $spkr2= "sw_" . $A[2];
34 $gender1 = $A[3];
35 $gender2 = $A[4];
36 if ($A[3] eq "M") {
37 $gender1 = "m";
38 } elsif ($A[3] eq "F") {
39 $gender1 = "f";
40 } else {
41 die "Unknown Gender in $line";
42 }
43 if ($A[4] eq "M") {
44 $gender2 = "m";
45 } elsif ($A[4] eq "F") {
46 $gender2 = "f";
47 } else {
48 die "Unknown Gender in $line";
49 }
50 if (-e "$db_base/data/$wav.sph") {
51 $uttId = $spkr1 . "-swbdc_" . $wav ."_1";
52 if (!$spk2gender{$spkr1}) {
53 $spk2gender{$spkr1} = $gender1;
54 print GNDR "$spkr1"," $gender1\n";
55 }
56 print WAV "$uttId"," sph2pipe -f wav -p -c 1 $db_base/data/$wav.sph |\n";
57 print SPKR "$uttId"," $spkr1","\n";
58
59 $uttId = $spkr2 . "-swbdc_" . $wav ."_2";
60 if (!$spk2gender{$spkr2}) {
61 $spk2gender{$spkr2} = $gender2;
62 print GNDR "$spkr2"," $gender2\n";
63 }
64 print WAV "$uttId"," sph2pipe -f wav -p -c 2 $db_base/data/$wav.sph |\n";
65 print SPKR "$uttId"," $spkr2","\n";
66 } else {
67 print STDERR "Missing $db_base/data/$wav.sph\n";
68 }
69 }
70}
71
72close(WAV) || die;
73close(SPKR) || die;
74close(GNDR) || die;
75if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
76 die "Error creating spk2utt file in directory $out_dir";
77}
78if (system("utils/fix_data_dir.sh $out_dir") != 0) {
79 die "Error fixing data dir $out_dir";
80}
81if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
82 die "Error validating directory $out_dir";
83}
diff --git a/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh
new file mode 100755
index 000000000..9f132bdbd
--- /dev/null
+++ b/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh
@@ -0,0 +1,70 @@
1#!/bin/bash
2#
3# Apache 2.0.
4
5# This script applies sliding window cmvn and removes silence frames. This
6# is performed on the raw features prior to generating examples for training
7# the xvector system.
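# The output is a new data directory whose feats.scp points at the
# normalized, silence-stripped features; the input directory is left
# untouched.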
8
9nj=40
10cmd="run.pl"
11stage=0
12norm_vars=false
13center=true
14compress=true
15cmn_window=300
16
17echo "$0 $@" # Print the command line for logging
18
19if [ -f path.sh ]; then . ./path.sh; fi
20. parse_options.sh || exit 1;
21if [ $# != 3 ]; then
22 echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
23 echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
24 echo "Options: "
25 echo " --nj <nj> # number of parallel jobs"
26 echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
27 echo " --norm-vars <true|false> # If true, normalize variances in the sliding window cmvn"
28 exit 1;
29fi
30
31data_in=$1
32data_out=$2
33dir=$3
34
35name=`basename $data_in`
36
37for f in $data_in/feats.scp $data_in/vad.scp ; do
38 [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
39done
40
41# Set various variables.
42mkdir -p $dir/log
43mkdir -p $data_out
44featdir=${PWD}/$dir
45
46cp $data_in/utt2spk $data_out/utt2spk
47cp $data_in/spk2utt $data_out/spk2utt
48cp $data_in/wav.scp $data_out/wav.scp
49
50for n in $(seq $nj); do
51 # the next command does nothing unless $featdir/storage/ exists, see
52 # utils/create_data_link.pl for more info.
53 utils/create_data_link.pl $featdir/xvector_feats_${name}.$n.ark
54done
55
56sdata_in=$data_in/split$nj;
57utils/split_data.sh $data_in $nj || exit 1;
58
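# The pipeline below: sliding-window CMVN, then drop the frames the VAD
# marked as silence, then write the (optionally compressed) features as
# archives plus a per-job scp.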
59$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \
60 apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \
61 scp:${sdata_in}/JOB/feats.scp ark:- \| \
62 select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \
63 copy-feats --compress=$compress ark:- \
64 ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1;
65
66for n in $(seq $nj); do
67 cat $featdir/xvector_feats_${name}.$n.scp || exit 1;
68done > ${data_out}/feats.scp || exit 1
69
70echo "$0: Succeeded creating xvector features for $name"
diff --git a/egs/sre16/v1/local/nnet3/xvector/run_xvector.sh b/egs/sre16/v1/local/nnet3/xvector/run_xvector.sh
new file mode 120000
index 000000000..585b63fd2
--- /dev/null
+++ b/egs/sre16/v1/local/nnet3/xvector/run_xvector.sh
@@ -0,0 +1 @@
tuning/run_xvector_1a.sh \ No newline at end of file
diff --git a/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh
new file mode 100755
index 000000000..6e87b30f2
--- /dev/null
+++ b/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh
@@ -0,0 +1,152 @@
1#!/bin/bash
2# Copyright 2017 David Snyder
3# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
4# 2017 Johns Hopkins University (Author: Daniel Povey)
5# Apache 2.0.
6
7# This script trains a DNN similar to the recipe described in
8# http://www.danielpovey.com/files/2017_interspeech_embeddings.pdf .
9
10. ./cmd.sh
11set -e
12
13stage=1
14train_stage=0
15use_gpu=true
16remove_egs=false
17
18data=data/train
19nnet_dir=exp/xvector_nnet_1a/
20egs_dir=exp/xvector_nnet_1a/egs
21
22. ./path.sh
23. ./cmd.sh
24. ./utils/parse_options.sh
25
26num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
27
28# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh.
29# The argument --num-repeats is related to the number of times a speaker
30# repeats per archive. If it seems like you're getting too many archives
31# (e.g., more than 200) try increasing the --frames-per-iter option. The
32# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the
33# minimum and maximum length (in terms of number of frames) of the features
34# in the examples.
35#
36# To make sense of the egs script, it may be necessary to put an "exit 1"
37# command immediately after stage 3. Then, inspect
38# exp/<your-dir>/egs/temp/ranges.* . The ranges files specify the examples that
39# will be created, and which archives they will be stored in. Each line of
40# ranges.* has the following form:
41# <utt-id> <local-ark-indx> <global-ark-indx> <start-frame> <end-frame> <spk-id>
42# For example:
43# 100304-f-sre2006-kacg-A 1 2 4079 881 23
44
45# If you're satisfied with the number of archives (e.g., 50-150 archives is
46# reasonable) and with the number of examples per speaker (e.g., 1000-5000
47# is reasonable) then you can let the script continue to the later stages.
48# Otherwise, try increasing or decreasing the --num-repeats option. You might
49# need to fiddle with --frames-per-iter. Increasing this value decreases
50# the number of archives and increases the number of examples per archive.
51# Decreasing this value increases the number of archives, while decreasing the
52# number of examples per archive.
53if [ $stage -le 4 ]; then
54 echo "$0: Getting neural network training egs";
55 # dump egs.
56 if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
57 utils/create_split_dir.pl \
58 /export/b{03,04,05,06}/$USER/kaldi-data/egs/sre16/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
59 fi
60 sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
61 --nj 8 \
62 --stage 0 \
63 --frames-per-iter 1000000000 \
64 --frames-per-iter-diagnostic 100000 \
65 --min-frames-per-chunk 200 \
66 --max-frames-per-chunk 400 \
67 --num-diagnostic-archives 3 \
68 --num-repeats 35 \
69 "$data" $egs_dir
70fi
71
72if [ $stage -le 5 ]; then
73 echo "$0: creating neural net configs using the xconfig parser";
74 num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}')
75 feat_dim=$(cat $egs_dir/info/feat_dim)
76
77 # This chunk-size corresponds to the maximum number of frames the
78 # stats layer is able to pool over. In this script, it corresponds
79 # to 100 seconds. If the input recording is greater than 100 seconds,
80 # we will compute multiple xvectors from the same recording and average
81 # to produce the final xvector.
82 max_chunk_size=10000
83
84 # The smallest number of frames we're comfortable computing an xvector from.
85 # Note that the hard minimum is given by the left and right context of the
86 # frame-level layers.
87 min_chunk_size=25
88 mkdir -p $nnet_dir/configs
89 cat <<EOF > $nnet_dir/configs/network.xconfig
90 # please note that it is important to have an input layer with name=input
91
92 # The frame-level layers
93 input dim=${feat_dim} name=input
94 relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
95 relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
96 relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
97 relu-batchnorm-layer name=tdnn4 dim=512
98 relu-batchnorm-layer name=tdnn5 dim=1500
99
100 # The stats pooling layer. Layers after this are segment-level.
101 # In the config below, the first and last arguments (0 and ${max_chunk_size})
102 # mean that we pool over an input segment starting at frame 0
103 # and ending at frame ${max_chunk_size} or earlier. The middle arguments (1:1)
104 # mean that no subsampling is performed.
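 # (In general, the stats config has the form
 # <left-context>:<input-period>:<output-period>:<right-context>.)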
105 stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})
106
107 # This is where we usually extract the embedding (aka xvector) from.
108 relu-batchnorm-layer name=tdnn6 dim=512 input=stats
109
110 # This is another layer from which the embedding could be extracted,
111 # but usually the previous one works better.
112 relu-batchnorm-layer name=tdnn7 dim=512
113 output-layer name=output include-log-softmax=true dim=${num_targets}
114EOF
115
116 steps/nnet3/xconfig_to_configs.py \
117 --xconfig-file $nnet_dir/configs/network.xconfig \
118 --config-dir $nnet_dir/configs/
119 cp $nnet_dir/configs/final.config $nnet_dir/nnet.config
120
121 # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh
122 echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config
123 echo "$max_chunk_size" > $nnet_dir/max_chunk_size
124 echo "$min_chunk_size" > $nnet_dir/min_chunk_size
125fi
126
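# The dropout schedule is a comma-separated list of <proportion>@<data-fraction>
# points, interpolated piecewise-linearly: dropout stays at 0 for the first 20%
# of training, rises to 0.1 at the halfway point, then decays back to 0.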
127dropout_schedule='0,0@0.20,0.1@0.50,0'
128srand=123
129if [ $stage -le 6 ]; then
130 steps/nnet3/train_raw_dnn.py --stage=$train_stage \
131 --cmd="$train_cmd" \
132 --trainer.optimization.proportional-shrink 10 \
133 --trainer.optimization.momentum=0.5 \
134 --trainer.optimization.num-jobs-initial=3 \
135 --trainer.optimization.num-jobs-final=8 \
136 --trainer.optimization.initial-effective-lrate=0.001 \
137 --trainer.optimization.final-effective-lrate=0.0001 \
138 --trainer.optimization.minibatch-size=64 \
139 --trainer.srand=$srand \
140 --trainer.max-param-change=2 \
141 --trainer.num-epochs=3 \
142 --trainer.dropout-schedule="$dropout_schedule" \
143 --trainer.shuffle-buffer-size=1000 \
144 --egs.frames-per-eg=1 \
145 --egs.dir="$egs_dir" \
146 --cleanup.remove-egs $remove_egs \
147 --cleanup.preserve-model-interval=10 \
148 --use-gpu=true \
149 --dir=$nnet_dir || exit 1;
150fi
151
152exit 0;
diff --git a/egs/sre16/v1/path.sh b/egs/sre16/v1/path.sh
new file mode 100755
index 000000000..e50f57c52
--- /dev/null
+++ b/egs/sre16/v1/path.sh
@@ -0,0 +1,5 @@
1export KALDI_ROOT=`pwd`/../../..
2export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
3[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
4. $KALDI_ROOT/tools/config/common_path.sh
5export LC_ALL=C
diff --git a/egs/sre16/v1/run.sh b/egs/sre16/v1/run.sh
new file mode 100755
index 000000000..3ab81d2df
--- /dev/null
+++ b/egs/sre16/v1/run.sh
@@ -0,0 +1,289 @@
1#!/bin/bash
2# Copyright 2017 David Snyder
3# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
4# 2017 Johns Hopkins University (Author: Daniel Povey)
5# Apache 2.0.
6#
7# See README.txt for more info on data required.
8# Results (mostly EERs) are inline in comments below.
9#
10# This example demonstrates a "bare bones" NIST SRE 2016 recipe using ivectors.
11# In the future, we will add score-normalization and a more effective form of
12# PLDA domain adaptation.
13
14. cmd.sh
15. path.sh
16set -e
17mfccdir=`pwd`/mfcc
18vaddir=`pwd`/mfcc
19
20# SRE16 trials
21sre16_trials=data/sre16_eval_test/trials
22sre16_trials_tgl=data/sre16_eval_test/trials_tgl
23sre16_trials_yue=data/sre16_eval_test/trials_yue
24
25stage=0
26if [ $stage -le 0 ]; then
27 # Path to some, but not all of the training corpora
28 data_root=/export/corpora/LDC
29
30 # Prepare telephone and microphone speech from Mixer6.
31 local/make_mx6.sh $data_root/LDC2013S03 data/
32
33 # Prepare SRE10 test and enroll. Includes microphone interview speech.
34 # NOTE: This corpus is now available through the LDC as LDC2017S06.
35 local/make_sre10.pl /export/corpora5/SRE/SRE2010/eval/ data/
36
37 # Prepare SRE08 test and enroll. Includes some microphone speech.
38 local/make_sre08.pl $data_root/LDC2011S08 $data_root/LDC2011S05 data/
39
40 # This prepares the older NIST SREs from 2004-2006.
41 local/make_sre.sh $data_root data/
42
43 # Combine all SREs prior to 2016 and Mixer6 into one dataset
44 utils/combine_data.sh data/sre \
45 data/sre2004 data/sre2005_train \
46 data/sre2005_test data/sre2006_train \
47 data/sre2006_test_1 data/sre2006_test_2 \
48 data/sre08 data/mx6 data/sre10
49 utils/validate_data_dir.sh --no-text --no-feats data/sre
50 utils/fix_data_dir.sh data/sre
51
52 # Prepare SWBD corpora.
53 local/make_swbd_cellular1.pl $data_root/LDC2001S13 \
54 data/swbd_cellular1_train
55 local/make_swbd_cellular2.pl /export/corpora5/LDC/LDC2004S07 \
56 data/swbd_cellular2_train
57 local/make_swbd2_phase1.pl $data_root/LDC98S75 \
58 data/swbd2_phase1_train
59 local/make_swbd2_phase2.pl /export/corpora5/LDC/LDC99S79 \
60 data/swbd2_phase2_train
61 local/make_swbd2_phase3.pl /export/corpora5/LDC/LDC2002S06 \
62 data/swbd2_phase3_train
63
64 # Combine all SWB corpora into one dataset.
65 utils/combine_data.sh data/swbd \
66 data/swbd_cellular1_train data/swbd_cellular2_train \
67 data/swbd2_phase1_train data/swbd2_phase2_train data/swbd2_phase3_train
68
69 # Prepare NIST SRE 2016 evaluation data.
70 local/make_sre16_eval.pl /export/corpora5/SRE/R149_0_1 data
71
72 # Prepare unlabeled Cantonese and Tagalog development data. This dataset
73 # was distributed to SRE participants.
74 local/make_sre16_unlabeled.pl /export/corpora5/SRE/LDC2016E46_SRE16_Call_My_Net_Training_Data data
75fi
76
77if [ $stage -le 1 ]; then
78 # Make MFCCs and compute the energy-based VAD for each dataset
79 for name in sre swbd sre16_eval_enroll sre16_eval_test sre16_major; do
80 steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
81 data/${name} exp/make_mfcc $mfccdir
82 utils/fix_data_dir.sh data/${name}
83 sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
84 data/${name} exp/make_vad $vaddir
85 utils/fix_data_dir.sh data/${name}
86 done
87fi
88
89if [ $stage -le 2 ]; then
90 # Train the UBM.
91 sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \
92 --nj 40 --num-threads 8 --subsample 1 \
93 data/sre16_major 2048 \
94 exp/diag_ubm
95
96 sid/train_full_ubm.sh --cmd "$train_cmd --mem 25G" \
97 --nj 40 --remove-low-count-gaussians false --subsample 1 \
98 data/sre16_major \
99 exp/diag_ubm exp/full_ubm
100fi
101
102if [ $stage -le 3 ]; then
103 # Train the i-vector extractor.
104 utils/combine_data.sh data/swbd_sre data/swbd data/sre
105 sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \
106 --ivector-dim 600 \
107 --num-iters 5 \
108 exp/full_ubm/final.ubm data/swbd_sre \
109 exp/extractor
110fi
111
112# In this section, we augment the SRE data with reverberation,
113# noise, music, and babble, and combine it with the clean SRE
114# data. The combined list will be used to train the PLDA model.
115if [ $stage -le 4 ]; then
116 utils/data/get_utt2num_frames.sh --nj 40 --cmd "$train_cmd" data/sre
117 frame_shift=0.01
118 awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/sre/utt2num_frames > data/sre/reco2dur
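 # (utt2num_frames counts 10 ms frames, so scaling by the frame shift gives
 # durations in seconds, which the reverberation script reads from reco2dur.)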
119
120 if [ ! -d "RIRS_NOISES" ]; then
121 # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
122 wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
123 unzip rirs_noises.zip
124 fi
125
126 # Make a version with reverberated speech
127 rvb_opts=()
128 rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
129 rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
130
131 # Make a reverberated version of the SRE list. Note that we don't add any
132 # additive noise here.
133 python steps/data/reverberate_data_dir.py \
134 "${rvb_opts[@]}" \
135 --speech-rvb-probability 1 \
136 --pointsource-noise-addition-probability 0 \
137 --isotropic-noise-addition-probability 0 \
138 --num-replications 1 \
139 --source-sampling-rate 8000 \
140 data/sre data/sre_reverb
141 cp data/sre/vad.scp data/sre_reverb/
142 utils/copy_data_dir.sh --utt-suffix "-reverb" data/sre_reverb data/sre_reverb.new
143 rm -rf data/sre_reverb
144 mv data/sre_reverb.new data/sre_reverb
145
146 # Prepare the MUSAN corpus, which consists of music, speech, and noise
147 # suitable for augmentation.
148 local/make_musan.sh /export/corpora/JHU/musan data
149
150 # Get the duration of the MUSAN recordings. This will be used by the
151 # script augment_data_dir.py.
152 for name in speech noise music; do
153 utils/data/get_utt2dur.sh data/musan_${name}
154 mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
155 done
156
157 # Augment with musan_noise
158 python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/sre data/sre_noise
159 # Augment with musan_music
160 python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/sre data/sre_music
161 # Augment with musan_speech
162 python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/sre data/sre_babble
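 # In the commands above, the colon-separated lists are SNRs in dB sampled per
 # utterance; the --fg-* options add foreground noises repeatedly at intervals,
 # while the --bg-* options mix a fixed number of background streams.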
163
164 # Combine reverb, noise, music, and babble into one directory.
165 utils/combine_data.sh data/sre_aug data/sre_reverb data/sre_noise data/sre_music data/sre_babble
166
167 # Take a random subset of the augmentations (64k is roughly the size of the SRE dataset)
168 utils/subset_data_dir.sh data/sre_aug 64000 data/sre_aug_64k
169 utils/fix_data_dir.sh data/sre_aug_64k
170
171 # Make MFCCs for the augmented data. Note that we should already have the vad.scp
172 # from the clean version at this point; it applies equally to the augmented copies.
173 steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
174 data/sre_aug_64k exp/make_mfcc $mfccdir
175
176 # Combine the clean and augmented SRE list. This is now roughly
177 # double the size of the original clean list.
178 utils/combine_data.sh data/sre_combined data/sre_aug_64k data/sre
179fi
180
181if [ $stage -le 5 ]; then
182 # Extract i-vectors for SRE data (includes Mixer 6). We'll use this for
183 # things like LDA or PLDA.
184 sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
185 exp/extractor data/sre_combined \
186 exp/ivectors_sre_combined
187
188 # The SRE16 major partition is an unlabeled dataset consisting of Cantonese
189 # and Tagalog. This is useful for things like centering, whitening and
190 # score normalization.
191 sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
192 exp/extractor data/sre16_major \
193 exp/ivectors_sre16_major
194
195 # The SRE16 test data
196 sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
197 exp/extractor data/sre16_eval_test \
198 exp/ivectors_sre16_eval_test
199
200 # The SRE16 enroll data
201 sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
202 exp/extractor data/sre16_eval_enroll \
203 exp/ivectors_sre16_eval_enroll
204fi
205
206if [ $stage -le 6 ]; then
207 # Compute the mean vector for centering the evaluation i-vectors.
208 $train_cmd exp/ivectors_sre16_major/log/compute_mean.log \
209 ivector-mean scp:exp/ivectors_sre16_major/ivector.scp \
210 exp/ivectors_sre16_major/mean.vec || exit 1;
211
212 # This script uses LDA to decrease the dimensionality prior to PLDA.
213 lda_dim=200
214 $train_cmd exp/ivectors_sre_combined/log/lda.log \
215 ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
216 "ark:ivector-subtract-global-mean scp:exp/ivectors_sre_combined/ivector.scp ark:- |" \
217 ark:data/sre_combined/utt2spk exp/ivectors_sre_combined/transform.mat || exit 1;
218
219 # Train the PLDA model.
220 $train_cmd exp/ivectors_sre_combined/log/plda.log \
221 ivector-compute-plda ark:data/sre_combined/spk2utt \
222 "ark:ivector-subtract-global-mean scp:exp/ivectors_sre_combined/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
223 exp/ivectors_sre_combined/plda || exit 1;
224
225 # Here we adapt the out-of-domain PLDA model to SRE16 major, a pile
226 # of unlabeled in-domain data. In the future, we will include a clustering
227 # based approach for domain adaptation.
228 $train_cmd exp/ivectors_sre16_major/log/plda_adapt.log \
229 ivector-adapt-plda --within-covar-scale=0.75 --between-covar-scale=0.25 \
230 exp/ivectors_sre_combined/plda \
231 "ark:ivector-subtract-global-mean scp:exp/ivectors_sre16_major/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
232 exp/ivectors_sre16_major/plda_adapt || exit 1;
233fi
234
235if [ $stage -le 7 ]; then
236 # Get results using the out-of-domain PLDA model
237 $train_cmd exp/scores/log/sre16_eval_scoring.log \
238 ivector-plda-scoring --normalize-length=true \
239 --num-utts=ark:exp/ivectors_sre16_eval_enroll/num_utts.ark \
240 "ivector-copy-plda --smoothing=0.0 exp/ivectors_sre_combined/plda - |" \
241 "ark:ivector-mean ark:data/sre16_eval_enroll/spk2utt scp:exp/ivectors_sre16_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec ark:- ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
242 "ark:ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec scp:exp/ivectors_sre16_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
243 "cat '$sre16_trials' | cut -d\ --fields=1,2 |" exp/scores/sre16_eval_scores || exit 1;
244
245 utils/filter_scp.pl $sre16_trials_tgl exp/scores/sre16_eval_scores > exp/scores/sre16_eval_tgl_scores
246 utils/filter_scp.pl $sre16_trials_yue exp/scores/sre16_eval_scores > exp/scores/sre16_eval_yue_scores
247 pooled_eer=$(paste $sre16_trials exp/scores/sre16_eval_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
248 tgl_eer=$(paste $sre16_trials_tgl exp/scores/sre16_eval_tgl_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
249 yue_eer=$(paste $sre16_trials_yue exp/scores/sre16_eval_yue_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
250 echo "Using Out-of-Domain PLDA, EER: Pooled ${pooled_eer}%, Tagalog ${tgl_eer}%, Cantonese ${yue_eer}%"
251 # EER: Pooled 13.65%, Tagalog 17.73%, Cantonese 9.612%
252fi
253
254if [ $stage -le 8 ]; then
255 # Get results using an adapted PLDA model. In the future we'll replace
256 # this (or add to this) with a clustering based approach to PLDA adaptation.
257 $train_cmd exp/scores/log/sre16_eval_scoring_adapt.log \
258 ivector-plda-scoring --normalize-length=true \
259 --num-utts=ark:exp/ivectors_sre16_eval_enroll/num_utts.ark \
260 "ivector-copy-plda --smoothing=0.0 exp/ivectors_sre16_major/plda_adapt - |" \
261 "ark:ivector-mean ark:data/sre16_eval_enroll/spk2utt scp:exp/ivectors_sre16_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec ark:- ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
262 "ark:ivector-subtract-global-mean exp/ivectors_sre16_major/mean.vec scp:exp/ivectors_sre16_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
263 "cat '$sre16_trials' | cut -d\ --fields=1,2 |" exp/scores/sre16_eval_scores_adapt || exit 1;
264
265 utils/filter_scp.pl $sre16_trials_tgl exp/scores/sre16_eval_scores_adapt > exp/scores/sre16_eval_tgl_scores_adapt
266 utils/filter_scp.pl $sre16_trials_yue exp/scores/sre16_eval_scores_adapt > exp/scores/sre16_eval_yue_scores_adapt
267 pooled_eer=$(paste $sre16_trials exp/scores/sre16_eval_scores_adapt | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
268 tgl_eer=$(paste $sre16_trials_tgl exp/scores/sre16_eval_tgl_scores_adapt | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
269 yue_eer=$(paste $sre16_trials_yue exp/scores/sre16_eval_yue_scores_adapt | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
270 echo "Using Adapted PLDA, EER: Pooled ${pooled_eer}%, Tagalog ${tgl_eer}%, Cantonese ${yue_eer}%"
271 # EER: Pooled 12.98%, Tagalog 17.8%, Cantonese 8.35%
272 #
273 # Using the official SRE16 scoring software, we obtain the following equalized results:
274 #
275 # -- Pooled --
276 # EER: 13.08
277 # min_Cprimary: 0.72
278 # act_Cprimary: 0.73
279
280 # -- Cantonese --
281 # EER: 8.23
282 # min_Cprimary: 0.59
283 # act_Cprimary: 0.59
284
285 # -- Tagalog --
286 # EER: 17.87
287 # min_Cprimary: 0.84
288 # act_Cprimary: 0.87
289fi
diff --git a/egs/sre16/v1/sid b/egs/sre16/v1/sid
new file mode 120000
index 000000000..5cb0274b7
--- /dev/null
+++ b/egs/sre16/v1/sid
@@ -0,0 +1 @@
../../sre08/v1/sid/ \ No newline at end of file
diff --git a/egs/sre16/v1/steps b/egs/sre16/v1/steps
new file mode 120000
index 000000000..6e99bf5b5
--- /dev/null
+++ b/egs/sre16/v1/steps
@@ -0,0 +1 @@
../../wsj/s5/steps \ No newline at end of file
diff --git a/egs/sre16/v1/utils b/egs/sre16/v1/utils
new file mode 120000
index 000000000..b24088521
--- /dev/null
+++ b/egs/sre16/v1/utils
@@ -0,0 +1 @@
../../wsj/s5/utils \ No newline at end of file
diff --git a/egs/sre16/v2/README.txt b/egs/sre16/v2/README.txt
new file mode 100644
index 000000000..0c9cc0d15
--- /dev/null
+++ b/egs/sre16/v2/README.txt
@@ -0,0 +1,30 @@
1 This recipe replaces iVectors used in the v1 recipe with embeddings extracted
2 from a deep neural network. In the scripts, we refer to these embeddings as
3 "xvectors." The recipe is based on
4 http://www.danielpovey.com/files/2017_interspeech_embeddings.pdf but with
5 improvements due to augmentation in the DNN training data.
6
7 The recipe uses the following data for system development. This is in
8 addition to the NIST SRE 2016 dataset used for evaluation (see ../README.txt).
9
10 Corpus LDC Catalog No.
11 SWBD2 Phase 1 LDC98S75
12 SWBD2 Phase 2 LDC99S79
13 SWBD2 Phase 3 LDC2002S06
14 SWBD Cellular 1 LDC2001S13
15 SWBD Cellular 2 LDC2004S07
16 SRE2004 LDC2006S44
17 SRE2005 Train LDC2011S01
18 SRE2005 Test LDC2011S04
19 SRE2006 Train LDC2011S09
20 SRE2006 Test 1 LDC2011S10
21 SRE2006 Test 2 LDC2012S01
22 SRE2008 Train LDC2011S05
23 SRE2008 Test LDC2011S08
24 SRE2010 Eval LDC2017S06
25 Mixer 6 LDC2013S03
26
27 The following datasets are used in data augmentation.
28
29 MUSAN http://www.openslr.org/17
30 RIR_NOISES http://www.openslr.org/28
diff --git a/egs/sre16/v2/cmd.sh b/egs/sre16/v2/cmd.sh
new file mode 100755
index 000000000..d1ca1a6d1
--- /dev/null
+++ b/egs/sre16/v2/cmd.sh
@@ -0,0 +1,15 @@
1# you can change cmd.sh depending on what type of queue you are using.
2# If you have no queueing system and want to run on a local machine, you
3# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
4# commands one by one: most recipes will exhaust the memory on your
5# machine). queue.pl works with GridEngine (qsub). slurm.pl works
6# with slurm. Different queues are configured differently, with different
7# queue names and different ways of specifying things like memory;
8# to account for these differences you can create and edit the file
9# conf/queue.conf to match your queue's configuration. Search for
10# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12
13export train_cmd="queue.pl --mem 4G"
14
15
diff --git a/egs/sre16/v2/conf/mfcc.conf b/egs/sre16/v2/conf/mfcc.conf
new file mode 100644
index 000000000..d32a22179
--- /dev/null
+++ b/egs/sre16/v2/conf/mfcc.conf
@@ -0,0 +1,6 @@
1--sample-frequency=8000
2--frame-length=25 # the default is 25
3--low-freq=20 # the default.
4--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case).
5--num-ceps=23 # higher than the default which is 12.
6--snip-edges=false
diff --git a/egs/sre16/v2/conf/vad.conf b/egs/sre16/v2/conf/vad.conf
new file mode 100644
index 000000000..c9f5e8b30
--- /dev/null
+++ b/egs/sre16/v2/conf/vad.conf
@@ -0,0 +1,4 @@
1--vad-energy-threshold=5.5
2--vad-energy-mean-scale=0.5
3--vad-proportion-threshold=0.12
4--vad-frames-context=2
diff --git a/egs/sre16/v2/local b/egs/sre16/v2/local
new file mode 120000
index 000000000..740b697d6
--- /dev/null
+++ b/egs/sre16/v2/local
@@ -0,0 +1 @@
../v1/local/ \ No newline at end of file
diff --git a/egs/sre16/v2/path.sh b/egs/sre16/v2/path.sh
new file mode 100755
index 000000000..e50f57c52
--- /dev/null
+++ b/egs/sre16/v2/path.sh
@@ -0,0 +1,5 @@
1export KALDI_ROOT=`pwd`/../../..
2export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
3[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
4. $KALDI_ROOT/tools/config/common_path.sh
5export LC_ALL=C
diff --git a/egs/sre16/v2/run.sh b/egs/sre16/v2/run.sh
new file mode 100755
index 000000000..d18118bac
--- /dev/null
+++ b/egs/sre16/v2/run.sh
@@ -0,0 +1,320 @@
1#!/bin/bash
2# Copyright 2017 David Snyder
3# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
4# 2017 Johns Hopkins University (Author: Daniel Povey)
5# Apache 2.0.
6#
7# See README.txt for more info on data required.
8# Results (mostly EERs) are inline in comments below.
9#
10# This example demonstrates a "bare bones" NIST SRE 2016 recipe using xvectors.
11# In the future, we will add score-normalization and a more effective form of
12# PLDA domain adaptation.
13
14. cmd.sh
15. path.sh
16set -e
17mfccdir=`pwd`/mfcc
18vaddir=`pwd`/mfcc
19
20# SRE16 trials
21sre16_trials=data/sre16_eval_test/trials
22sre16_trials_tgl=data/sre16_eval_test/trials_tgl
23sre16_trials_yue=data/sre16_eval_test/trials_yue
24nnet_dir=exp/xvector_nnet_1a
25
26stage=0
27if [ $stage -le 0 ]; then
28 # Path to some, but not all, of the training corpora
29 data_root=/export/corpora/LDC
30
31 # Prepare telephone and microphone speech from Mixer6.
32 local/make_mx6.sh $data_root/LDC2013S03 data/
33
34 # Prepare SRE10 test and enroll. Includes microphone interview speech.
35 # NOTE: This corpus is now available through the LDC as LDC2017S06.
36 local/make_sre10.pl /export/corpora5/SRE/SRE2010/eval/ data/
37
38 # Prepare SRE08 test and enroll. Includes some microphone speech.
39 local/make_sre08.pl $data_root/LDC2011S08 $data_root/LDC2011S05 data/
40
41 # This prepares the older NIST SREs from 2004-2006.
42 local/make_sre.sh $data_root data/
43
44 # Combine all SREs prior to 2016 and Mixer6 into one dataset
45 utils/combine_data.sh data/sre \
46 data/sre2004 data/sre2005_train \
47 data/sre2005_test data/sre2006_train \
48 data/sre2006_test_1 data/sre2006_test_2 \
49 data/sre08 data/mx6 data/sre10
50 utils/validate_data_dir.sh --no-text --no-feats data/sre
51 utils/fix_data_dir.sh data/sre
52
53 # Prepare SWBD corpora.
54 local/make_swbd_cellular1.pl $data_root/LDC2001S13 \
55 data/swbd_cellular1_train
56 local/make_swbd_cellular2.pl /export/corpora5/LDC/LDC2004S07 \
57 data/swbd_cellular2_train
58 local/make_swbd2_phase1.pl $data_root/LDC98S75 \
59 data/swbd2_phase1_train
60 local/make_swbd2_phase2.pl /export/corpora5/LDC/LDC99S79 \
61 data/swbd2_phase2_train
62 local/make_swbd2_phase3.pl /export/corpora5/LDC/LDC2002S06 \
63 data/swbd2_phase3_train
64
65 # Combine all SWBD corpora into one dataset.
66 utils/combine_data.sh data/swbd \
67 data/swbd_cellular1_train data/swbd_cellular2_train \
68 data/swbd2_phase1_train data/swbd2_phase2_train data/swbd2_phase3_train
69
70 # Prepare NIST SRE 2016 evaluation data.
71 local/make_sre16_eval.pl /export/corpora5/SRE/R149_0_1 data
72
73 # Prepare unlabeled Cantonese and Tagalog development data. This dataset
74 # was distributed to SRE participants.
75 local/make_sre16_unlabeled.pl /export/corpora5/SRE/LDC2016E46_SRE16_Call_My_Net_Training_Data data
76fi
77
78if [ $stage -le 1 ]; then
79 # Make MFCCs and compute the energy-based VAD for each dataset
80 for name in sre swbd sre16_eval_enroll sre16_eval_test sre16_major; do
81 steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
82 data/${name} exp/make_mfcc $mfccdir
83 utils/fix_data_dir.sh data/${name}
84 sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \
85 data/${name} exp/make_vad $vaddir
86 utils/fix_data_dir.sh data/${name}
87 done
88 utils/combine_data.sh data/swbd_sre data/swbd data/sre
89 utils/fix_data_dir.sh data/swbd_sre
90fi
91
92# In this section, we augment the SWBD and SRE data with reverberation,
93# noise, music, and babble, and combine it with the clean data.
94# The combined list will be used to train the xvector DNN. The SRE
95# subset will be used to train the PLDA model.
96if [ $stage -le 2 ]; then
97 utils/data/get_utt2num_frames.sh --nj 40 --cmd "$train_cmd" data/swbd_sre
98 frame_shift=0.01
99 awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/swbd_sre/utt2num_frames > data/swbd_sre/reco2dur
100
101 if [ ! -d "RIRS_NOISES" ]; then
102 # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
103 wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
104 unzip rirs_noises.zip
105 fi
106
107 # Make a version with reverberated speech
108 rvb_opts=()
109 rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
110 rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
111
112 # Make a reverberated version of the SWBD+SRE list. Note that we don't add any
113 # additive noise here.
114 python steps/data/reverberate_data_dir.py \
115 "${rvb_opts[@]}" \
116 --speech-rvb-probability 1 \
117 --pointsource-noise-addition-probability 0 \
118 --isotropic-noise-addition-probability 0 \
119 --num-replications 1 \
120 --source-sampling-rate 8000 \
121 data/swbd_sre data/swbd_sre_reverb
122 cp data/swbd_sre/vad.scp data/swbd_sre_reverb/
123 utils/copy_data_dir.sh --utt-suffix "-reverb" data/swbd_sre_reverb data/swbd_sre_reverb.new
124 rm -rf data/swbd_sre_reverb
125 mv data/swbd_sre_reverb.new data/swbd_sre_reverb
126
127 # Prepare the MUSAN corpus, which consists of music, speech, and noise
128 # suitable for augmentation.
129 local/make_musan.sh /export/corpora/JHU/musan data
130
131 # Get the duration of the MUSAN recordings. This will be used by the
132 # script augment_data_dir.py.
133 for name in speech noise music; do
134 utils/data/get_utt2dur.sh data/musan_${name}
135 mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
136 done
137
138 # Augment with musan_noise
139 python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/swbd_sre data/swbd_sre_noise
140 # Augment with musan_music
141 python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/swbd_sre data/swbd_sre_music
142 # Augment with musan_speech
143 python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/swbd_sre data/swbd_sre_babble
144
145 # Combine reverb, noise, music, and babble into one directory.
146 utils/combine_data.sh data/swbd_sre_aug data/swbd_sre_reverb data/swbd_sre_noise data/swbd_sre_music data/swbd_sre_babble
147
148 # Take a random subset of the augmentations (128k is somewhat larger than twice
149 # the size of the SWBD+SRE list)
150 utils/subset_data_dir.sh data/swbd_sre_aug 128000 data/swbd_sre_aug_128k
151 utils/fix_data_dir.sh data/swbd_sre_aug_128k
152
153 # Make MFCCs for the augmented data. Note that we do not compute a new
154 # vad.scp file here. Instead, we use the vad.scp from the clean version of
155 # the list.
156 steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
157 data/swbd_sre_aug_128k exp/make_mfcc $mfccdir
158
159 # Combine the clean and augmented SWBD+SRE list. This is now roughly
160 # double the size of the original clean list.
161 utils/combine_data.sh data/swbd_sre_combined data/swbd_sre_aug_128k data/swbd_sre
162
163 # Select the clean + augmented portion of the SRE list. This will be used to
164 # train the PLDA model later in the script.
165 utils/copy_data_dir.sh data/swbd_sre_combined data/sre_combined
166 utils/filter_scp.pl data/sre/spk2utt data/swbd_sre_combined/spk2utt | utils/spk2utt_to_utt2spk.pl > data/sre_combined/utt2spk
167 utils/fix_data_dir.sh data/sre_combined
168fi
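For reference, the SNR lists passed to augment_data_dir.py above end up as --snrs options to wav-reverberate, which scales each noise so the speech-to-noise power ratio matches the requested value before summing. A minimal numpy sketch of that additive-noise step (background noises are looped to cover the whole recording):

    import numpy as np

    def mix_at_snr(speech, noise, snr_db):
        # Loop the noise if it is shorter than the speech, then trim.
        if len(noise) < len(speech):
            noise = np.tile(noise, int(np.ceil(len(speech) / len(noise))))
        noise = noise[:len(speech)]
        # Scale the noise so 10*log10(P_speech / P_noise_scaled) == snr_db.
        p_speech = np.mean(speech ** 2)
        p_noise = np.mean(noise ** 2) + 1e-10
        scale = np.sqrt(p_speech / (p_noise * 10 ** (snr_db / 10.0)))
        return speech + scale * noise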
169
170# Now we prepare the features to generate examples for xvector training.
171if [ $stage -le 3 ]; then
172 # This script applies CMVN and removes nonspeech frames. Note that this is somewhat
173 # wasteful, as it roughly doubles the amount of training data on disk. After
174 # creating training examples, this can be removed.
175 local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \
176 data/swbd_sre_combined data/swbd_sre_combined_no_sil exp/swbd_sre_combined_no_sil
177 utils/fix_data_dir.sh data/swbd_sre_combined_no_sil
178 utils/data/get_utt2num_frames.sh --nj 40 --cmd "$train_cmd" data/swbd_sre_combined_no_sil
179 utils/fix_data_dir.sh data/swbd_sre_combined_no_sil
180
181 # Now, we need to remove features that are too short after removing silence
182 # frames. We want at least 5s (500 frames) per utterance.
183 min_len=500
184 mv data/swbd_sre_combined_no_sil/utt2num_frames data/swbd_sre_combined_no_sil/utt2num_frames.bak
185 awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/swbd_sre_combined_no_sil/utt2num_frames.bak > data/swbd_sre_combined_no_sil/utt2num_frames
186 utils/filter_scp.pl data/swbd_sre_combined_no_sil/utt2num_frames data/swbd_sre_combined_no_sil/utt2spk > data/swbd_sre_combined_no_sil/utt2spk.new
187 mv data/swbd_sre_combined_no_sil/utt2spk.new data/swbd_sre_combined_no_sil/utt2spk
188 utils/fix_data_dir.sh data/swbd_sre_combined_no_sil
189
190 # We also want several utterances per speaker. Now we'll throw out speakers
191 # with fewer than 8 utterances.
192 min_num_utts=8
193 awk '{print $1, NF-1}' data/swbd_sre_combined_no_sil/spk2utt > data/swbd_sre_combined_no_sil/spk2num
194 awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/swbd_sre_combined_no_sil/spk2num | utils/filter_scp.pl - data/swbd_sre_combined_no_sil/spk2utt > data/swbd_sre_combined_no_sil/spk2utt.new
195 mv data/swbd_sre_combined_no_sil/spk2utt.new data/swbd_sre_combined_no_sil/spk2utt
196 utils/spk2utt_to_utt2spk.pl data/swbd_sre_combined_no_sil/spk2utt > data/swbd_sre_combined_no_sil/utt2spk
197
198 utils/filter_scp.pl data/swbd_sre_combined_no_sil/utt2spk data/swbd_sre_combined_no_sil/utt2num_frames > data/swbd_sre_combined_no_sil/utt2num_frames.new
199 mv data/swbd_sre_combined_no_sil/utt2num_frames.new data/swbd_sre_combined_no_sil/utt2num_frames
200
201 # Now we're ready to create training examples.
202 utils/fix_data_dir.sh data/swbd_sre_combined_no_sil
203fi
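The awk and filter_scp.pl pipeline above implements two pruning rules: keep utterances with more than 500 frames of speech after silence removal, then keep speakers that still have at least 8 utterances. The same logic as a small Python sketch (dict-based, for illustration only):

    def prune(utt2num_frames, spk2utt, min_len=500, min_num_utts=8):
        # Keep utterances longer than min_len frames ($2 > min_len in awk).
        long_utts = {u for u, n in utt2num_frames.items() if n > min_len}
        kept = {}
        for spk, utts in spk2utt.items():
            remaining = [u for u in utts if u in long_utts]
            # Keep speakers with at least min_num_utts surviving utterances.
            if len(remaining) >= min_num_utts:
                kept[spk] = remaining
        return kept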
204
205local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage -1 \
206 --data data/swbd_sre_combined_no_sil --nnet-dir $nnet_dir \
207 --egs-dir $nnet_dir/egs
208
209if [ $stage -le 7 ]; then
210 # SRE16 major is an unlabeled dataset consisting of Cantonese and
211 # Tagalog. It is useful for things like centering, whitening, and
212 # score normalization.
213 sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
214 $nnet_dir data/sre16_major \
215 exp/xvectors_sre16_major
216
217 # Extract xvectors for the SRE data (includes Mixer 6). We'll use these
218 # for things like training the LDA transform and the PLDA model.
219 sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 12G" --nj 40 \
220 $nnet_dir data/sre_combined \
221 exp/xvectors_sre_combined
222
223 # The SRE16 test data
224 sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
225 $nnet_dir data/sre16_eval_test \
226 exp/xvectors_sre16_eval_test
227
228 # The SRE16 enroll data
229 sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 6G" --nj 40 \
230 $nnet_dir data/sre16_eval_enroll \
231 exp/xvectors_sre16_eval_enroll
232fi
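In the scoring stages below, ivector-mean (with a spk2utt argument) collapses each enrollment speaker's xvectors into a single model vector. In plain numpy terms, that step is just:

    import numpy as np

    def enroll_means(spk2utt, utt2xvector):
        # Average each enrollment speaker's xvectors; utt2xvector maps
        # utterance IDs to numpy vectors (a hypothetical in-memory
        # stand-in for the xvector.scp entries).
        return {spk: np.mean([utt2xvector[u] for u in utts], axis=0)
                for spk, utts in spk2utt.items()}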
233
234if [ $stage -le 8 ]; then
235 # Compute the mean vector for centering the evaluation xvectors.
236 $train_cmd exp/xvectors_sre16_major/log/compute_mean.log \
237 ivector-mean scp:exp/xvectors_sre16_major/xvector.scp \
238 exp/xvectors_sre16_major/mean.vec || exit 1;
239
240 # This script uses LDA to decrease the dimensionality prior to PLDA.
241 lda_dim=150
242 $train_cmd exp/xvectors_sre_combined/log/lda.log \
243 ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
244 "ark:ivector-subtract-global-mean scp:exp/xvectors_sre_combined/xvector.scp ark:- |" \
245 ark:data/sre_combined/utt2spk exp/xvectors_sre_combined/transform.mat || exit 1;
246
247 # Train an out-of-domain PLDA model.
248 $train_cmd exp/xvectors_sre_combined/log/plda.log \
249 ivector-compute-plda ark:data/sre_combined/spk2utt \
250 "ark:ivector-subtract-global-mean scp:exp/xvectors_sre_combined/xvector.scp ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
251 exp/xvectors_sre_combined/plda || exit 1;
252
253 # Here we adapt the out-of-domain PLDA model to SRE16 major, a pile
254 # of unlabeled in-domain data. In the future, we will include a
255 # clustering-based approach for domain adaptation, which tends to work better.
256 $train_cmd exp/xvectors_sre16_major/log/plda_adapt.log \
257 ivector-adapt-plda --within-covar-scale=0.75 --between-covar-scale=0.25 \
258 exp/xvectors_sre_combined/plda \
259 "ark:ivector-subtract-global-mean scp:exp/xvectors_sre16_major/xvector.scp ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
260 exp/xvectors_sre16_major/plda_adapt || exit 1;
261fi
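Stages 9 and 10 always push xvectors through the same preprocessing chain before PLDA: subtract the SRE16 major mean, apply the LDA projection, and length-normalize. A sketch of that chain, assuming transform.mat is a purely linear matrix (transform-vec also accepts an affine variant) and normalizing to unit length (ivector-normalize-length actually scales to sqrt(dim), which differs only by a global constant):

    import numpy as np

    def preprocess(x, mean_vec, lda):
        # Mirrors: ivector-subtract-global-mean | transform-vec |
        # ivector-normalize-length in the scoring commands below.
        x = lda.dot(x - mean_vec)
        return x / np.linalg.norm(x)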
262
263if [ $stage -le 9 ]; then
264 # Get results using the out-of-domain PLDA model.
265 $train_cmd exp/scores/log/sre16_eval_scoring.log \
266 ivector-plda-scoring --normalize-length=true \
267 --num-utts=ark:exp/xvectors_sre16_eval_enroll/num_utts.ark \
268 "ivector-copy-plda --smoothing=0.0 exp/xvectors_sre_combined/plda - |" \
269 "ark:ivector-mean ark:data/sre16_eval_enroll/spk2utt scp:exp/xvectors_sre16_eval_enroll/xvector.scp ark:- | ivector-subtract-global-mean exp/xvectors_sre16_major/mean.vec ark:- ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
270 "ark:ivector-subtract-global-mean exp/xvectors_sre16_major/mean.vec scp:exp/xvectors_sre16_eval_test/xvector.scp ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
271 "cat '$sre16_trials' | cut -d\ --fields=1,2 |" exp/scores/sre16_eval_scores || exit 1;
272
273 utils/filter_scp.pl $sre16_trials_tgl exp/scores/sre16_eval_scores > exp/scores/sre16_eval_tgl_scores
274 utils/filter_scp.pl $sre16_trials_yue exp/scores/sre16_eval_scores > exp/scores/sre16_eval_yue_scores
275 pooled_eer=$(paste $sre16_trials exp/scores/sre16_eval_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
276 tgl_eer=$(paste $sre16_trials_tgl exp/scores/sre16_eval_tgl_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
277 yue_eer=$(paste $sre16_trials_yue exp/scores/sre16_eval_yue_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
278 echo "Using Out-of-Domain PLDA, EER: Pooled ${pooled_eer}%, Tagalog ${tgl_eer}%, Cantonese ${yue_eer}%"
279 # EER: Pooled 11.73%, Tagalog 15.96%, Cantonese 7.52%
280 # For reference, here's the ivector system from ../v1:
281 # EER: Pooled 13.65%, Tagalog 17.73%, Cantonese 9.61%
282fi
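The EERs above come from compute-eer, which sweeps a threshold over the scores and reports the rate at the point where false alarms and misses are equal. A compact numpy equivalent, for intuition rather than as a replacement for the binary:

    import numpy as np

    def compute_eer(target_scores, nontarget_scores):
        scores = np.concatenate([target_scores, nontarget_scores])
        labels = np.concatenate([np.ones(len(target_scores)),
                                 np.zeros(len(nontarget_scores))])
        # Sort by descending score; accepting the top k trials gives one
        # operating point per k.
        labels = labels[np.argsort(-scores)]
        far = np.cumsum(1 - labels) / len(nontarget_scores)  # false alarms
        frr = 1 - np.cumsum(labels) / len(target_scores)     # misses
        k = np.argmin(np.abs(far - frr))
        return 100.0 * (far[k] + frr[k]) / 2  # percent, as printed above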
283
284if [ $stage -le 10 ]; then
285 # Get results using the adapted PLDA model.
286 $train_cmd exp/scores/log/sre16_eval_scoring_adapt.log \
287 ivector-plda-scoring --normalize-length=true \
288 --num-utts=ark:exp/xvectors_sre16_eval_enroll/num_utts.ark \
289 "ivector-copy-plda --smoothing=0.0 exp/xvectors_sre16_major/plda_adapt - |" \
290 "ark:ivector-mean ark:data/sre16_eval_enroll/spk2utt scp:exp/xvectors_sre16_eval_enroll/xvector.scp ark:- | ivector-subtract-global-mean exp/xvectors_sre16_major/mean.vec ark:- ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
291 "ark:ivector-subtract-global-mean exp/xvectors_sre16_major/mean.vec scp:exp/xvectors_sre16_eval_test/xvector.scp ark:- | transform-vec exp/xvectors_sre_combined/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
292 "cat '$sre16_trials' | cut -d\ --fields=1,2 |" exp/scores/sre16_eval_scores_adapt || exit 1;
293
294 utils/filter_scp.pl $sre16_trials_tgl exp/scores/sre16_eval_scores_adapt > exp/scores/sre16_eval_tgl_scores_adapt
295 utils/filter_scp.pl $sre16_trials_yue exp/scores/sre16_eval_scores_adapt > exp/scores/sre16_eval_yue_scores_adapt
296 pooled_eer=$(paste $sre16_trials exp/scores/sre16_eval_scores_adapt | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
297 tgl_eer=$(paste $sre16_trials_tgl exp/scores/sre16_eval_tgl_scores_adapt | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
298 yue_eer=$(paste $sre16_trials_yue exp/scores/sre16_eval_yue_scores_adapt | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
299 echo "Using Adapted PLDA, EER: Pooled ${pooled_eer}%, Tagalog ${tgl_eer}%, Cantonese ${yue_eer}%"
300 # EER: Pooled 8.57%, Tagalog 12.29%, Cantonese 4.89%
301 # For reference, here's the ivector system from ../v1:
302 # EER: Pooled 12.98%, Tagalog 17.8%, Cantonese 8.35%
303 #
304 # Using the official SRE16 scoring software, we obtain the following equalized results:
305 #
306 # -- Pooled --
307 # EER: 8.66
308 # min_Cprimary: 0.61
309 # act_Cprimary: 0.62
310 #
311 # -- Cantonese --
312 # EER: 4.69
313 # min_Cprimary: 0.42
314 # act_Cprimary: 0.43
315 #
316 # -- Tagalog --
317 # EER: 12.63
318 # min_Cprimary: 0.76
319 # act_Cprimary: 0.81
320fi
diff --git a/egs/sre16/v2/sid b/egs/sre16/v2/sid
new file mode 120000
index 000000000..5cb0274b7
--- /dev/null
+++ b/egs/sre16/v2/sid
@@ -0,0 +1 @@
../../sre08/v1/sid/ \ No newline at end of file
diff --git a/egs/sre16/v2/steps b/egs/sre16/v2/steps
new file mode 120000
index 000000000..1b186770d
--- /dev/null
+++ b/egs/sre16/v2/steps
@@ -0,0 +1 @@
../../wsj/s5/steps/ \ No newline at end of file
diff --git a/egs/sre16/v2/utils b/egs/sre16/v2/utils
new file mode 120000
index 000000000..b24088521
--- /dev/null
+++ b/egs/sre16/v2/utils
@@ -0,0 +1 @@
../../wsj/s5/utils \ No newline at end of file
diff --git a/egs/wsj/s5/steps/data/augment_data_dir.py b/egs/wsj/s5/steps/data/augment_data_dir.py
new file mode 100755
index 000000000..520e7b56e
--- /dev/null
+++ b/egs/wsj/s5/steps/data/augment_data_dir.py
@@ -0,0 +1,194 @@
1#!/usr/bin/env python3
2# Copyright 2017 David Snyder
3# Apache 2.0
4#
5# This script generates augmented data. It is based on
6# steps/data/reverberate_data_dir.py but doesn't handle reverberation.
7# It is designed to be somewhat simpler and more flexible for augmenting with
8# additive noise.
9from __future__ import print_function
10import sys, random, argparse, os, imp
11sys.path.append("steps/data/")
12from reverberate_data_dir import ParseFileToDict
13from reverberate_data_dir import WriteDictToFile
14data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py')
15
16def GetArgs():
17 parser = argparse.ArgumentParser(description="Augment the data directory with additive noises. "
18 "Noises are separated into background and foreground noises which are added together or "
19 "separately. Background noises are added to the entire recording, and repeated as necessary "
20 "to cover the full length. Multiple overlapping background noises can be added, to simulate "
21 "babble, for example. Foreground noises are added sequentially, according to a specified "
22 "interval. See also steps/data/reverberate_data_dir.py "
23 "Usage: augment_data_dir.py [options...] <in-data-dir> <out-data-dir> "
24 "E.g., steps/data/augment_data_dir.py --utt-suffix aug --fg-snrs 20:10:5:0 --bg-snrs 20:15:10 "
25 "--num-bg-noise 1:2:3 --fg-interval 3 --fg-noise-dir data/musan_noise --bg-noise-dir "
26 "data/musan_music data/train data/train_aug", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
27 parser.add_argument('--fg-snrs', type=str, dest = "fg_snr_str", default = '20:10:0',
28 help='When foreground noises are being added, the script will iterate through these SNRs.')
29 parser.add_argument('--bg-snrs', type=str, dest = "bg_snr_str", default = '20:10:0',
30 help='When background noises are being added, the script will iterate through these SNRs.')
31 parser.add_argument('--num-bg-noises', type=str, dest = "num_bg_noises", default = '1',
32 help='Number of overlapping background noises that we iterate over. For example, if the input is "1:2:3" then the output wavs will have either 1, 2, or 3 randomly chosen background noises overlapping the entire recording')
33 parser.add_argument('--fg-interval', type=int, dest = "fg_interval", default = 0,
34 help='Number of seconds between the end of one foreground noise and the beginning of the next.')
35 parser.add_argument('--utt-suffix', type=str, dest = "utt_suffix", default = "aug", help='Suffix added to utterance IDs.')
36 parser.add_argument('--random-seed', type=int, dest = "random_seed", default = 123, help='Random seed.')
37
38 parser.add_argument("--bg-noise-dir", type=str, dest="bg_noise_dir",
39 help="Background noise data directory")
40 parser.add_argument("--fg-noise-dir", type=str, dest="fg_noise_dir",
41 help="Foreground noise data directory")
42 parser.add_argument("input_dir", help="Input data directory")
43 parser.add_argument("output_dir", help="Output data directory")
44
45 print(' '.join(sys.argv))
46 args = parser.parse_args()
47 args = CheckArgs(args)
48 return args
49
50def CheckArgs(args):
51 if not os.path.exists(args.output_dir):
52 os.makedirs(args.output_dir)
53 if args.fg_interval < 0:
54 raise Exception("--fg-interval must be 0 or greater")
55 if args.bg_noise_dir is None and args.fg_noise_dir is None:
56 raise Exception("Either --fg-noise-dir or --bg-noise-dir must be specified")
57 return args
58
59def GetNoiseList(noise_wav_scp_filename):
60 noise_wav_scp_file = open(noise_wav_scp_filename, 'r').readlines()
61 noise_wavs = {}
62 noise_utts = []
63 for line in noise_wav_scp_file:
64 toks=line.split(" ")
65 wav = " ".join(toks[1:])
66 noise_utts.append(toks[0])
67 noise_wavs[toks[0]] = wav.rstrip()
68 return noise_utts, noise_wavs
69
70def AugmentWav(utt, wav, dur, fg_snr_opts, bg_snr_opts, fg_noise_utts, \
71 bg_noise_utts, noise_wavs, noise2dur, interval, num_opts):
72 # This section is common to both foreground and background noises
73 new_wav = ""
74 dur_str = str(dur)
75 noise_dur = 0
76 tot_noise_dur = 0
77 snrs=[]
78 noises=[]
79 start_times=[]
80
81 # Now handle the background noises
82 if len(bg_noise_utts) > 0:
83 num = random.choice(num_opts)
84 for i in range(0, num):
85 noise_utt = random.choice(bg_noise_utts)
86 noise = noise_wavs[noise_utt] + " wav-reverberate --duration=" \
87 + dur_str + " - - |"
88 snr = random.choice(bg_snr_opts)
89 snrs.append(snr)
90 start_times.append(0)
91 noises.append(noise)
92
93 # Now handle the foreground noises
94 if len(fg_noise_utts) > 0:
95 while tot_noise_dur < dur:
96 noise_utt = random.choice(fg_noise_utts)
97 noise = noise_wavs[noise_utt]
98 snr = random.choice(fg_snr_opts)
99 snrs.append(snr)
100 noise_dur = noise2dur[noise_utt]
101 start_times.append(tot_noise_dur)
102 tot_noise_dur += noise_dur + interval
103 noises.append(noise)
104
105 start_times_str = "--start-times='" + ",".join(map(str,start_times)) + "'"
106 snrs_str = "--snrs='" + ",".join(map(str,snrs)) + "'"
107 noises_str = "--additive-signals='" + ",".join(noises) + "'"
108
109 # If the wav is just a file
110 if len(wav.split()) == 1:
111 new_wav = "wav-reverberate --shift-output=true " + noises_str + " " \
112 + start_times_str + " " + snrs_str + " " + wav + " - |"
113 # Else if the wav is in a pipe
114 else:
115 new_wav = wav + "wav-reverberate --shift-output=true " + noises_str + " " \
116 + start_times_str + " " + snrs_str + " - - |"
117 return new_wav
118
119def CopyFileIfExists(utt_suffix, filename, input_dir, output_dir):
120 if os.path.isfile(input_dir + "/" + filename):
121 dict = ParseFileToDict(input_dir + "/" + filename,
122 value_processor = lambda x: " ".join(x))
123 if len(utt_suffix) > 0:
124 new_dict = {}
125 for key in dict.keys():
126 new_dict[key + "-" + utt_suffix] = dict[key]
127 dict = new_dict
128 WriteDictToFile(dict, output_dir + "/" + filename)
129
130def main():
131 args = GetArgs()
132 fg_snrs = list(map(int, args.fg_snr_str.split(":")))
133 bg_snrs = list(map(int, args.bg_snr_str.split(":")))
134 input_dir = args.input_dir
135 output_dir = args.output_dir
136 num_bg_noises = list(map(int, args.num_bg_noises.split(":")))
137 reco2dur = ParseFileToDict(input_dir + "/reco2dur",
138 value_processor = lambda x: float(x[0]))
139 wav_scp_file = open(input_dir + "/wav.scp", 'r').readlines()
140
141 noise_wavs = {}
142 noise_reco2dur = {}
143 bg_noise_utts = []
144 fg_noise_utts = []
145
146 # Load background noises
147 if args.bg_noise_dir:
148 bg_noise_wav_filename = args.bg_noise_dir + "/wav.scp"
149 bg_noise_utts, bg_noise_wavs = GetNoiseList(bg_noise_wav_filename)
150 bg_noise_reco2dur = ParseFileToDict(args.bg_noise_dir + "/reco2dur",
151 value_processor = lambda x: float(x[0]))
152 noise_wavs.update(bg_noise_wavs)
153 noise_reco2dur.update(bg_noise_reco2dur)
154
155 # Load foreground noises
156 if args.fg_noise_dir:
157 fg_noise_wav_filename = args.fg_noise_dir + "/wav.scp"
158 fg_noise_reco2dur_filename = args.fg_noise_dir + "/reco2dur"
159 fg_noise_utts, fg_noise_wavs = GetNoiseList(fg_noise_wav_filename)
160 fg_noise_reco2dur = ParseFileToDict(args.fg_noise_dir + "/reco2dur",
161 value_processor = lambda x: float(x[0]))
162 noise_wavs.update(fg_noise_wavs)
163 noise_reco2dur.update(fg_noise_reco2dur)
164
165 random.seed(args.random_seed)
166 new_utt2wav = {}
167 new_utt2spk = {}
168
169 # Augment each line in the wav file
170 for line in wav_scp_file:
171 toks = line.rstrip().split(" ")
172 utt = toks[0]
173 wav = " ".join(toks[1:])
174 dur = reco2dur[utt]
175 new_wav = AugmentWav(utt, wav, dur, fg_snrs, bg_snrs, fg_noise_utts,
176 bg_noise_utts, noise_wavs, noise_reco2dur, args.fg_interval,
177 num_bg_noises)
178 new_utt = utt + "-" + args.utt_suffix
179 new_utt2wav[new_utt] = new_wav
180
181 if not os.path.exists(output_dir):
182 os.makedirs(output_dir)
183
184 WriteDictToFile(new_utt2wav, output_dir + "/wav.scp")
185 CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
186 CopyFileIfExists(args.utt_suffix, "utt2lang", input_dir, output_dir)
187 CopyFileIfExists(args.utt_suffix, "text", input_dir, output_dir)
188 CopyFileIfExists(args.utt_suffix, "utt2spk", input_dir, output_dir)
189 CopyFileIfExists(args.utt_suffix, "vad.scp", input_dir, output_dir)
190 CopyFileIfExists("", "spk2gender", input_dir, output_dir)
191 data_lib.RunKaldiCommand("utils/fix_data_dir.sh {output_dir}".format(output_dir = output_dir))
192
193if __name__ == "__main__":
194 main()
diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py
index 0083efa49..71e64d9e6 100755
--- a/egs/wsj/s5/steps/data/reverberate_data_dir.py
+++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py
@@ -20,7 +20,7 @@ def GetArgs():
20 "--random-seed 1 data/train data/train_rvb", 20 "--random-seed 1 data/train data/train_rvb",
21 formatter_class=argparse.ArgumentDefaultsHelpFormatter) 21 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22 22
23 parser.add_argument("--rir-set-parameters", type=str, action='append', required = True, dest = "rir_set_para_array", 23 parser.add_argument("--rir-set-parameters", type=str, action='append', required = True, dest = "rir_set_para_array",
24 help="Specifies the parameters of an RIR set. " 24 help="Specifies the parameters of an RIR set. "
25 "Supports the specification of mixture_weight and rir_list_file_name. The mixture weight is optional. " 25 "Supports the specification of mixture_weight and rir_list_file_name. The mixture weight is optional. "
26 "The default mixture weight is the probability mass remaining after adding the mixture weights " 26 "The default mixture weight is the probability mass remaining after adding the mixture weights "
@@ -104,7 +104,7 @@ def CheckArgs(args):
104 104
105 if args.isotropic_noise_addition_probability < 0 or args.isotropic_noise_addition_probability > 1: 105 if args.isotropic_noise_addition_probability < 0 or args.isotropic_noise_addition_probability > 1:
106 raise Exception("--isotropic-noise-addition-probability must be between 0 and 1") 106 raise Exception("--isotropic-noise-addition-probability must be between 0 and 1")
107 107
108 if args.rir_smoothing_weight < 0 or args.rir_smoothing_weight > 1: 108 if args.rir_smoothing_weight < 0 or args.rir_smoothing_weight > 1:
109 raise Exception("--rir-smoothing-weight must be between 0 and 1") 109 raise Exception("--rir-smoothing-weight must be between 0 and 1")
110 110
@@ -113,7 +113,7 @@ def CheckArgs(args):
113 113
114 if args.max_noises_per_minute < 0: 114 if args.max_noises_per_minute < 0:
115 raise Exception("--max-noises-per-minute cannot be negative") 115 raise Exception("--max-noises-per-minute cannot be negative")
116 116
117 if args.source_sampling_rate is not None and args.source_sampling_rate <= 0: 117 if args.source_sampling_rate is not None and args.source_sampling_rate <= 0:
118 raise Exception("--source-sampling-rate cannot be non-positive") 118 raise Exception("--source-sampling-rate cannot be non-positive")
119 119
@@ -133,7 +133,7 @@ class list_cyclic_iterator:
133 133
134 134
135# This functions picks an item from the collection according to the associated probability distribution. 135# This functions picks an item from the collection according to the associated probability distribution.
136# The probability estimate of each item in the collection is stored in the "probability" field of 136# The probability estimate of each item in the collection is stored in the "probability" field of
137# the particular item. x : a collection (list or dictionary) where the values contain a field called probability 137# the particular item. x : a collection (list or dictionary) where the values contain a field called probability
138def PickItemWithProbability(x): 138def PickItemWithProbability(x):
139 if isinstance(x, dict): 139 if isinstance(x, dict):
@@ -155,7 +155,6 @@ def PickItemWithProbability(x):
155def ParseFileToDict(file, assert2fields = False, value_processor = None): 155def ParseFileToDict(file, assert2fields = False, value_processor = None):
156 if value_processor is None: 156 if value_processor is None:
157 value_processor = lambda x: x[0] 157 value_processor = lambda x: x[0]
158
159 dict = {} 158 dict = {}
160 for line in open(file, 'r'): 159 for line in open(file, 'r'):
161 parts = line.split() 160 parts = line.split()
@@ -236,7 +235,7 @@ def AddPointSourceNoise(noise_addition_descriptor, # descriptor to store the in
236 235
237 236
238# This function randomly decides whether to reverberate, and sample a RIR if it does 237# This function randomly decides whether to reverberate, and sample a RIR if it does
239# It also decides whether to add the appropriate noises 238# It also decides whether to add the appropriate noises
240# This function return the string of options to the binary wav-reverberate 239# This function return the string of options to the binary wav-reverberate
241def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format 240def GenerateReverberationOpts(room_dict, # the room dictionary, please refer to MakeRoomDict() for the format
242 pointsource_noise_list, # the point source noise list 241 pointsource_noise_list, # the point source noise list
@@ -306,15 +305,15 @@ def GetNewId(id, prefix=None, copy=0):
306 new_id = id 305 new_id = id
307 306
308 return new_id 307 return new_id
309 308
310 309
311# This is the main function to generate pipeline command for the corruption 310# This is the main function to generate pipeline command for the corruption
312# The generic command of wav-reverberate will be like: 311# The generic command of wav-reverberate will be like:
313# wav-reverberate --duration=t --impulse-response=rir.wav 312# wav-reverberate --duration=t --impulse-response=rir.wav
314# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav 313# --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav
315def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings 314def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings
316 durations, # a dictionary whose values are the duration (in sec) of the speech recordings 315 durations, # a dictionary whose values are the duration (in sec) of the speech recordings
317 output_dir, # output directory to write the corrupted wav.scp 316 output_dir, # output directory to write the corrupted wav.scp
318 room_dict, # the room dictionary, please refer to MakeRoomDict() for the format 317 room_dict, # the room dictionary, please refer to MakeRoomDict() for the format
319 pointsource_noise_list, # the point source noise list 318 pointsource_noise_list, # the point source noise list
320 iso_noise_dict, # the isotropic noise dictionary 319 iso_noise_dict, # the isotropic noise dictionary
@@ -358,11 +357,11 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal
358 pointsource_noise_addition_probability, # Probability of adding point-source noises 357 pointsource_noise_addition_probability, # Probability of adding point-source noises
359 speech_dur, # duration of the recording 358 speech_dur, # duration of the recording
360 max_noises_recording # Maximum number of point-source noises that can be added 359 max_noises_recording # Maximum number of point-source noises that can be added
361 ) 360 )
362 361
363 # prefix using index 0 is reserved for original data e.g. rvb0_swb0035 corresponds to the swb0035 recording in original data 362 # prefix using index 0 is reserved for original data e.g. rvb0_swb0035 corresponds to the swb0035 recording in original data
364 if reverberate_opts == "" or i == 0: 363 if reverberate_opts == "" or i == 0:
365 wav_corrupted_pipe = "{0}".format(wav_original_pipe) 364 wav_corrupted_pipe = "{0}".format(wav_original_pipe)
366 else: 365 else:
367 wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts) 366 wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts)
368 367
@@ -380,7 +379,7 @@ def AddPrefixToFields(input_file, output_file, num_replicas, include_original, p
380 start_index = 0 379 start_index = 0
381 else: 380 else:
382 start_index = 1 381 start_index = 1
383 382
384 for i in range(start_index, num_replicas+1): 383 for i in range(start_index, num_replicas+1):
385 for line in list: 384 for line in list:
386 if len(line) > 0 and line[0] != ';': 385 if len(line) > 0 and line[0] != ';':
@@ -410,7 +409,7 @@ def CreateReverberatedCopy(input_dir,
410 pointsource_noise_addition_probability, # Probability of adding point-source noises 409 pointsource_noise_addition_probability, # Probability of adding point-source noises
411 max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration 410 max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration
412 ): 411 ):
413 412
414 wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) 413 wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x))
415 if not os.path.isfile(input_dir + "/reco2dur"): 414 if not os.path.isfile(input_dir + "/reco2dur"):
416 print("Getting the duration of the recordings..."); 415 print("Getting the duration of the recordings...");
@@ -426,8 +425,8 @@ def CreateReverberatedCopy(input_dir,
426 background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) 425 background_snr_array = map(lambda x: float(x), background_snr_string.split(':'))
427 426
428 GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict, 427 GenerateReverberatedWavScp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict,
429 foreground_snr_array, background_snr_array, num_replicas, include_original, prefix, 428 foreground_snr_array, background_snr_array, num_replicas, include_original, prefix,
430 speech_rvb_probability, shift_output, isotropic_noise_addition_probability, 429 speech_rvb_probability, shift_output, isotropic_noise_addition_probability,
431 pointsource_noise_addition_probability, max_noises_per_minute) 430 pointsource_noise_addition_probability, max_noises_per_minute)
432 431
433 AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1]) 432 AddPrefixToFields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1])
@@ -447,7 +446,7 @@ def CreateReverberatedCopy(input_dir,
447 if os.path.isfile(input_dir + "/reco2file_and_channel"): 446 if os.path.isfile(input_dir + "/reco2file_and_channel"):
448 AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1]) 447 AddPrefixToFields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1])
449 448
450 data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats {output_dir}" 449 data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}"
451 .format(output_dir = output_dir)) 450 .format(output_dir = output_dir))
452 451
453 452
@@ -507,7 +506,7 @@ def ParseSetParameterStrings(set_para_array):
507 return SmoothProbabilityDistribution(set_list) 506 return SmoothProbabilityDistribution(set_list)
508 507
509 508
510# This function creates the RIR list 509# This function creates the RIR list
511# Each rir object in the list contains the following attributes: 510# Each rir object in the list contains the following attributes:
512# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability 511# rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability
513# Please refer to the help messages in the parser for the meaning of these attributes 512# Please refer to the help messages in the parser for the meaning of these attributes
@@ -521,7 +520,7 @@ def ParseRirList(rir_set_para_array, smoothing_weight, sampling_rate = None):
521 rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse response.') 520 rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse response.')
522 rir_parser.add_argument('--cte', type=float, default=None, help='Early-to-late index of the impulse response.') 521 rir_parser.add_argument('--cte', type=float, default=None, help='Early-to-late index of the impulse response.')
523 rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse response.') 522 rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse response.')
524 rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command. 523 rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command.
525 E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """) 524 E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """)
526 525
527 set_list = ParseSetParameterStrings(rir_set_para_array) 526 set_list = ParseSetParameterStrings(rir_set_para_array)
@@ -569,7 +568,7 @@ def MakeRoomDict(rir_list):
569 return room_dict 568 return room_dict
570 569
571 570
572# This function creates the point-source noise list 571# This function creates the point-source noise list
573# and the isotropic noise dictionary from the noise information file 572# and the isotropic noise dictionary from the noise information file
574# The isotropic noise dictionary is indexed by the room 573# The isotropic noise dictionary is indexed by the room
575# and its value is the corrresponding isotropic noise list 574# and its value is the corrresponding isotropic noise list
@@ -596,7 +595,7 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None)
596 current_noise_list = map(lambda x: noise_parser.parse_args(shlex.split(x.strip())),open(noise_set.filename)) 595 current_noise_list = map(lambda x: noise_parser.parse_args(shlex.split(x.strip())),open(noise_set.filename))
597 current_pointsource_noise_list = [] 596 current_pointsource_noise_list = []
598 for noise in current_noise_list: 597 for noise in current_noise_list:
599 if sampling_rate is not None: 598 if sampling_rate is not None:
600 # check if the rspecifier is a pipe or not 599 # check if the rspecifier is a pipe or not
601 if len(noise.noise_rspecifier.split()) == 1: 600 if len(noise.noise_rspecifier.split()) == 1:
602 noise.noise_rspecifier = "sox {0} -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate) 601 noise.noise_rspecifier = "sox {0} -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate)
@@ -615,11 +614,11 @@ def ParseNoiseList(noise_set_para_array, smoothing_weight, sampling_rate = None)
615 614
616 pointsource_noise_list += SmoothProbabilityDistribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability) 615 pointsource_noise_list += SmoothProbabilityDistribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability)
617 616
618 # ensure the point-source noise probabilities sum to 1 617 # ensure the point-source noise probabilities sum to 1
619 pointsource_noise_list = SmoothProbabilityDistribution(pointsource_noise_list, smoothing_weight, 1.0) 618 pointsource_noise_list = SmoothProbabilityDistribution(pointsource_noise_list, smoothing_weight, 1.0)
620 if len(pointsource_noise_list) > 0: 619 if len(pointsource_noise_list) > 0:
621 assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0) 620 assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0)
622 621
623 # ensure the isotropic noise source probabilities for a given room sum to 1 622 # ensure the isotropic noise source probabilities for a given room sum to 1
624 for key in iso_noise_dict.keys(): 623 for key in iso_noise_dict.keys():
625 iso_noise_dict[key] = SmoothProbabilityDistribution(iso_noise_dict[key]) 624 iso_noise_dict[key] = SmoothProbabilityDistribution(iso_noise_dict[key])
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/stats_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/stats_layer.py
index c114833d4..77e7bbb33 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/stats_layer.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/stats_layer.py
@@ -64,9 +64,12 @@ class XconfigStatsLayer(XconfigLayerBase):
64 self._stats_period = int(m.group(4)) 64 self._stats_period = int(m.group(4))
65 self._right_context = int(m.group(5)) 65 self._right_context = int(m.group(5))
66 66
67 output_dim = (self.descriptors['input']['dim'] 67 if self._output_stddev:
68 * (2 if self._output_stddev else 1) 68 output_dim = 2 * self.descriptors['input']['dim']
69 + 1 if self._output_log_counts else 0) 69 else:
70 output_dim = self.descriptors['input']['dim']
71 if self._output_log_counts:
72 output_dim = output_dim + 1
70
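The change above fixes an operator-precedence bug: Python parses "a * b + 1 if flag else 0" as "(a * b + 1) if flag else 0", so the old expression produced output_dim = 0 whenever log-counts were disabled. For reference, the pooling this layer configures concatenates per-window statistics; a numpy sketch (the order of the blocks here is illustrative):

    import numpy as np

    def stats_pooling(frames, output_stddev=True, output_log_counts=False):
        # frames: (num_frames, input_dim). Output dim matches the fixed
        # expression: input_dim, doubled if stddev is appended, plus one
        # if the log frame count is appended.
        out = [np.mean(frames, axis=0)]
        if output_stddev:
            out.append(np.std(frames, axis=0))
        if output_log_counts:
            out.append(np.array([np.log(frames.shape[0])]))
        return np.concatenate(out)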