#!/usr/bin/env python3
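"""Convert a dataset TSV file to Kaldi SCP files (utt2spk, wav.scp, text).

Reads one utterance per TSV row (wav path in column 0, transcript in column 4),
optionally normalizes the transcript and filters out utterances containing
out-of-vocabulary words, then writes a Kaldi-style data directory.

Example invocation (filenames are illustrative):
    ./convert_tsv_to_scp.py dataset.tsv my_data_dir
"""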
import argparse
import os
import re
parser = argparse.ArgumentParser(description='Convert a TSV file to Kaldi SCP files.')
parser.add_argument('filename', help='Dataset TSV file to convert.')
parser.add_argument('output_dir', nargs='?', default='dataset', help='Directory to save the output files.')
parser.add_argument('-l', '--lexicon_file', default='kaldi_model_daanzu_20200905_1ep-mediumlm-base/dict/lexicon.txt', help='Filename of the lexicon file, for filtering out out-of-vocabulary utterances.')
parser.add_argument('--no_lexicon', action='store_true', help='Do not filter utterances based on lexicon to remove ones containing out-of-vocabulary words.')
parser.add_argument('--no_normalize', action='store_true', help='Do not normalize the input text (lowercasing and punctuation removal).')
args = parser.parse_args()
if not os.path.exists(args.filename):
    raise Exception('File does not exist: %s' % args.filename)
os.makedirs(args.output_dir, exist_ok=True)
lexicon = set()
if args.lexicon_file and not args.no_lexicon:
    with open(args.lexicon_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            # Each lexicon entry is "WORD PRONUNCIATION..."; keep only the word.
            word = line.strip().split(None, 1)[0]
            lexicon.add(word)
else:
    print("WARNING: Lexicon filtering disabled; out-of-vocabulary utterances will be kept.")
def normalize_script(script):
    # Replace hyphens with spaces, strip common punctuation, and lowercase.
    script = re.sub(r'[\-]', ' ', script)
    script = re.sub(r'[,.?!:;"]', '', script)
    return script.strip().lower()
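# Example (illustrative): normalize_script('Hello, world!') -> 'hello world'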
utt2spk_dict, wav_dict, text_dict = {}, {}, {}
num_entries, num_dropped_lexicon, num_dropped_missing_wav = 0, 0, 0
with open(args.filename, 'r') as f:
    for line in f:
        num_entries += 1
        # Expected TSV columns: wav path in field 0, transcript in field 4.
        fields = line.rstrip('\n').split('\t')
        text = fields[4]
        wav_path = fields[0]
        # The utterance ID is the wav filename without its extension.
        utt_id = os.path.splitext(os.path.basename(wav_path))[0]
        if not args.no_normalize:
            text = normalize_script(text)
        # Drop utterances containing any out-of-vocabulary word.
        if lexicon and any(word not in lexicon for word in text.split()):
            num_dropped_lexicon += 1
            continue
        # Drop utterances whose audio file is missing.
        if not os.path.exists(wav_path):
            num_dropped_missing_wav += 1
            continue
        # Single-speaker setup: use the utterance ID as the speaker ID.
        utt2spk_dict[utt_id] = utt_id
        wav_dict[utt_id] = wav_path
        text_dict[utt_id] = text
# Kaldi expects data-directory files to be sorted by utterance ID.
with open(os.path.join(args.output_dir, 'utt2spk'), 'w') as f:
    for (key, val) in sorted(utt2spk_dict.items()):
        f.write('%s %s\n' % (key, val))
with open(os.path.join(args.output_dir, 'wav.scp'), 'w') as f:
    for (key, val) in sorted(wav_dict.items()):
        f.write('%s %s\n' % (key, val))
with open(os.path.join(args.output_dir, 'text'), 'w') as f:
    for (key, val) in sorted(text_dict.items()):
        f.write('%s %s\n' % (key, val))
if num_dropped_lexicon:
    print(f"{num_dropped_lexicon} ({num_dropped_lexicon / num_entries * 100:.1f}%) utterances dropped because they contained out-of-lexicon words.")
if num_dropped_missing_wav:
    print(f"{num_dropped_missing_wav} ({num_dropped_missing_wav / num_entries * 100:.1f}%) utterances dropped because the wav file could not be found at the given path.")
if not text_dict:
    raise Exception("No utterances remaining! Failure!")
print(f"Wrote training dataset ({len(text_dict)} utterances) to: {args.output_dir}")