Commit 764b3a75 authored by Sugon_ldc

add new model

# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 2000
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
# batch_size: 32
batch_size: 16
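# note (illustrative): 'static' batching puts exactly batch_size utterances in each
# batch; 'dynamic' batching instead caps the total number of frames per batch
# (typically configured with a max_frames_in_batch option)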
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
A ey
B b iy
C s iy
D d iy
E iy
F eh f
G jh iy
H ey ch
I ay
J jh ey
K k ey
L eh l
M eh m
N eh n
O ow
P p iy
Q k y uw
R aa r
S eh s
T t iy
U y uw
V v iy
W d ah b ax l y uw
X eh k s
Y w ay
Z z iy
1d0
< file: $SWB/data/dictionary/sw-ms98-dict.text
8645a8646
> uh-hum ah m hh ah m
9006c9007
< April ey p r ih l
---
> April ey p r ax l
9144d9144
< B ay zh aa n iy z
9261c9261
< Battle b ae t el
---
> Battle b ae t ax l
10014a10015
> Chevy sh eh v iy
10211a10213
> Colorado k ao l ax r aa d ow
10212a10215
> Colorado' k ao l ax r aa d ow z
10370c10373
< Creek k r ih k
---
> Creek k r iy k
10889a10893
> Eleven ax l eh v ih n
10951c10955
< Erie ih r iy
---
> Erie iy r iy
11183c11187
< Forever f ax r eh v er
---
> Forever f er eh v er
11231a11236
> Friday f r ay d iy
11744a11750
> History hh ih s t r iy
12004a12011,12012
> Israel ih z r ih l
> Israel's ih z r ih l z
12573a12582
> Lincoln l ih ng k ih n
12574a12584
> Lincolns l ih ng k ih n z
13268c13278
< NAACP eh ey ey s iy p iy
---
> NAACP eh n ey ey s iy p iy
13286c13296
< NIT eh ay t iy
---
> NIT eh n ay t iy
13292c13302
< NTSC eh t iy eh s s iy
---
> NTSC eh n t iy eh s s iy
14058a14069
> Quarter k ow r t er
14059a14071
> Quarterback k ow r t er b ae k
14060a14073
> Quarters k ow r t er z
14569a14583
> Science s ay n s
15087a15102
> Sunday s ah n d iy
15088a15104
> Sunday's s ah n d iy z
15089a15106
> Sundays s ah n d iy z
15290,15291c15307,15308
< Texan t eh k sh ih n
< Texan's t eh k sh ih n s
---
> Texan t eh k s ih n
> Texan's t eh k s ih n s
15335a15353
> Thousands th aw z ih n z
15739c15757
< Waco w ae k ow
---
> Waco w ey k ow
15841a15860
> Weekends w iy k eh n z
16782a16802
> acceptable eh k s eh p ax b ax l
16833a16854
> accounting ax k aw n ih ng
16948a16970
> address ax d r eh s
17281a17304
> already aa r d iy
17315a17339
> am m
17709a17734
> asked ae s t
17847a17873
> attorney ih t er n iy
17919a17946
> autopilot ao t ow p ay l ih t
17960a17988
> awfully ao f l iy
18221a18250
> basketball b ae s k ax b ao l
18222a18252
> basketball's b ae s k ax b ao l z
18302a18333
> become b ah k ah m
18303a18335
> becomes b iy k ah m z
18344a18377
> began b ax g en n
18817c18850
< bottle b aa t el
---
> bottle b aa t ax l
19332,19333c19365,19367
< camera's k ae m ax r ax z
< cameras k ae m ax r ax z
---
> camera k ae m r ax
> camera's k ae m r ax z
> cameras k ae m r ax z
19411a19446
> capital k ae p ax l
19505a19541
> carrying k ae r ih ng
20316a20353,20354
> combination k aa m ih n ey sh ih n
> combinations k aa m ih n ey sh ih n z
20831a20870
> contracts k aa n t r ae k s
21010a21050
> costs k ao s
21062a21103
> county k aw n iy
21371a21413
> cultural k ao l ch ax r ax l
21372a21415
> culturally k ao l ch ax r ax l iy
21373a21417
> culture k ao l ch er
21375a21420
> cultures k ao l ch er z
21543a21589
> data d ey t ax
22097a22144
> differently d ih f ax r ih n t l iy
22972a23020
> effects ax f eh k t s
23016a23065
> election ax l eh k sh ih n
23018a23068
> elections ax l eh k sh ih n z
23052a23103
> eleven ax l eh v ih n
23242a23294
> enjoyable ae n jh oy ax b ax l
23248a23301
> enjoys ae n jh oy z
23293a23347
> entire ih n t ay r
23295a23350,23351
> entirely ih n t ay r l iy
> entirety ih n t ay r t iy
23745a23802
> extra eh k s t er
23818a23876
> facts f ae k s
24508c24566
< forever f ax r eh v er
---
> forever f er eh v er
24514c24572
< forget f ow r g eh t
---
> forget f er r g eh t
24521a24580
> forgot f er r g aa t
24522a24582
> forgotten f er r g aa t ax n
24563a24624
> forward f ow er d
24680a24742
> frightening f r ay t n ih ng
24742a24805
> full-time f ax l t ay m
24862a24926
> garage g r aa jh
25218a25283
> grandmother g r ae m ah dh er
25790a25856
> heavily hh eh v ax l iy
25949a26016
> history hh ih s t r iy
26038a26106
> honestly aa n ax s t l iy
26039a26108
> honesty aa n ax s t iy
26099a26169
> horror hh ow r
26155a26226
> houses hh aw z ih z
26184c26255
< huh-uh hh ah hh ah
---
> huh-uh ah hh ah
26189c26260
< hum-um hh m hh m
---
> hum-um ah m hh ah m
26236a26308
> hunting hh ah n ih ng
26307a26380,26381
> ideal ay d iy l
> idealist ay d iy l ih s t
26369a26444
> imagine m ae jh ih n
26628a26704
> individuals ih n d ih v ih jh ax l z
26968a27045
> interest ih n t r ih s t
27184a27262
> it'd ih d
27702a27781
> lead l iy d
28378a28458
> mandatory m ae n d ih t ow r iy
28885a28966
> minute m ih n ih t
29167a29249
> mountains m aw t n z
29317a29400
> mysteries m ih s t r iy z
29318a29402
> mystery m ih s t r iy
29470a29555
> nervous n er v ih s
29578,29580c29663,29665
< nobody n ow b aa d iy
< nobody'll n ow b aa d iy l
< nobody's n ow b aa d iy z
---
> nobody n ow b ah d iy
> nobody'll n ow b ah d iy l
> nobody's n ow b ah d iy z
29712a29798
> nuclear n uw k l iy r
29938a30025
> onto aa n t ax
30051a30139
> originally ax r ih jh ax l iy
30507a30596
> particularly p er t ih k y ax l iy
30755a30845
> perfectly p er f ih k l iy
30820a30911
> personally p er s n ax l iy
30915a31007
> physically f ih z ih k l iy
30986a31079
> pilot p ay l ih t
30987a31081
> pilot's p ay l ih t s
31227a31322
> police p l iy s
31513a31609
> prefer p er f er
31553a31650
> prepare p r ax p ey r
31578a31676
> prescription p er s k r ih p sh ih n
31579a31678
> prescriptions p er s k r ih p sh ih n z
31770a31870
> products p r aa d ax k s
31821a31922
> projects p r aa jh eh k s
31908a32010
> protect p er t eh k t
31909a32012
> protected p er t eh k t ih d
31911a32015
> protection p er t eh k sh ih n
31914a32019
> protection p er t eh k t ih v
32149a32255
> quarter k ow r t er
32414a32521
> read r iy d
32785a32893
> rehabilitation r iy ax b ih l ih t ey sh ih n
33150a33259
> resource r ih s ow r s
33151a33261
> resources r iy s ow r s ih z
33539c33649
< roots r uh t s
---
> roots r uw t s
33929a34040
> science s ay n s
34315a34427
> seventy s eh v ih n iy
34319,34320c34431,34432
< severe s ax v iy r
< severely s ax v iy r l iy
---
> severe s ih v iy r
> severely s ih v iy r l iy
35060a35173
> software s ao f w ey r
35083a35197
> solid s ao l ih d
35084a35199
> solidly s ao l ih d l iy
35750a35866
> stood s t ih d
35854a35971
> strictly s t r ih k l iy
35889c36006
< stronger s t r ao ng er
---
> stronger s t r ao ng g er
36192a36310,36311
> supposed s p ow z
> supposed s p ow s
36510a36630
> tastes t ey s
36856a36977
> thoroughly th er r l iy
36866a36988
> thousands th aw z ih n z
37081c37203
< toots t uh t s
---
> toots t uw t s
37157a37280
> toward t w ow r d
37158a37282
> towards t w ow r d z
37564a37689
> twenties t w eh n iy z
37565a37691
> twentieth t w eh n iy ih th
37637a37764
> unacceptable ah n ae k s eh p ax b ax l
37728a37856
> understand ah n d er s t ae n
37860a37989
> unless ih n l eh s
38040a38170
> use y uw z
38049a38180
> uses y uw z ih z
38125a38257
> various v ah r iy ih s
38202a38335
> versus v er s ih z
38381c38514
< wacko w ae k ow
---
> wacko w ey k ow
38455c38588
< wanna w aa n ax
---
> wanna w ah n ax
38675c38808
< whatnot w ah t n aa t
---
> whatnot w aa t n aa t
38676a38810
> whatsoever w aa t s ow eh v er
38890c39024
< wok w aa k
---
> wok w ao k
38910a39045
> wondering w ah n d r ih ng
#!/usr/bin/env bash
# Hub-5 Eval 2000 data preparation
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
# The input is two directory names (possibly the same) containing the
# 2000 Hub5 english evaluation test set and transcripts, which are
# respectively: LDC2002S09 LDC2002T43
# e.g. see
# http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2002S09
# http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002T43
#
# Example usage:
# local/eval2000_data_prep_edin.sh /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/hub5/2000/transcr
# The first directory ($sdir) contains the speech data, and the directory
# $sdir/english/ must exist.
# The second directory ($tdir) contains the transcripts, and the directory
# $tdir/reference must exist; in particular we need the file
# $tdir/reference/hub5e00.english.000405.stm
if [ $# -ne 2 ]; then
echo "Usage: "`basename $0`" <speech-dir> <transcription-dir>"
echo "See comments in the script for more details"
exit 1
fi
sdir=$1
tdir=$2
[ ! -d $sdir/english ] \
&& echo Expecting directory $sdir/english to be present && exit 1;
[ -d $tdir/2000_hub5_eng_eval_tr ] \
&& tdir=$tdir/2000_hub5_eng_eval_tr
[ ! -d $tdir/reference ] \
&& echo Expecting directory $tdir/reference to be present && exit 1;
. ./path.sh
dir=data/local/eval2000
mkdir -p $dir
find -L $sdir/english -iname '*.sph' | sort > $dir/sph.flist
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
# Get segments file...
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
pem=$sdir/english/hub5e_00.pem
[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1;
# pem file has lines like:
# en_4156 A unknown_speaker 301.85 302.48
# we ignore the warnings below for now, although they seem to indicate some problems
# with the data.
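# For illustration, the pem line shown above becomes (up to rounding in the %06d
# conversion) the segments entry
#   en_4156-A_030185-030248 en_4156-A 301.85 302.48
# i.e. utt-id = <conv>-<side>_<start*100>-<end*100>; extend_segments.pl then pads
# each segment by 0.1 s at both ends while avoiding overlaps.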
grep -v ';;' $pem \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
print utt,spk,$4,$5;}' \
| sort -u | local/extend_segments.pl 0.1 > $dir/segments
# stm file has lines like:
# en_4156 A en_4156_A 357.64 359.64 <O,en,F,en-F> HE IS A POLICE OFFICER
# TODO(arnab): We should really be lowercasing this since the Edinburgh
# recipe uses lowercase. This is not used in the actual scoring.
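# For illustration, the stm line shown above becomes (up to rounding in the %06d
# conversion) the text.all entry
#   en_4156-A_035764-035964 HE IS A POLICE OFFICER
# i.e. fields 1-2 form the side id, fields 4-5 give the times, and fields 7 onward
# are kept as the transcript.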
grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \
| awk '{
spk=$1"-"$2;
utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100);
printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \
| sort > $dir/text.all
# We'll use the stm file for sclite scoring. There seem to be various errors
# in the stm file that upset hubscr.pl, and we fix them here.
sed -e 's:((:(:' -e 's:<B_ASIDE>::g' -e 's:<E_ASIDE>::g' \
$tdir/reference/hub5e00.english.000405.stm > $dir/stm
cp $tdir/reference/en20000405_hub5.glm $dir/glm
# the next line uses process substitution
# Just checking that the segments are the same in pem vs. stm.
! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \
echo "Segments from pem file and stm file do not match." && exit 1;
grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text
# side A - channel 1, side B - channel 2
bash tools/sph2wav.sh --nj 16 $dir/sph.scp $dir/segments $dir/wav.scp
# create an utt2spk file that assumes each conversation side is
# a separate speaker.
awk '{print $1,$2;}' $dir/segments > $dir/utt2spk
tools/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
# cp $dir/segments $dir/segments.tmp
# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \
# $dir/segments.tmp > $dir/segments
awk '{print $1}' $dir/wav_ori.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
dest=data/eval2000
mkdir -p $dest
for x in wav.scp text utt2spk spk2utt; do
cp $dir/$x $dest/$x
done
echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)"
tools/fix_data_dir.sh $dest
if [ $(wc -l < $dest/wav.scp) -ne 80 ]; then
echo "$0: error: expected 80 lines in wav.scp, got $(wc -l < $dest/wav.scp)"
exit 1;
fi
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
if (@ARGV != 1 || !($ARGV[0] =~ m/^-?\d+\.?\d*$/ && $ARGV[0] >= 0)) {
print STDERR "Usage: extend_segments.pl time-in-seconds <segments >segments.extended \n" .
"e.g. extend_segments.pl 0.25 <segments.1 >segments.2\n" .
"This command modifies a segments file, with lines like\n" .
" <utterance-id> <recording-id> <start-time> <end-time>\n" .
"by extending the beginning and end of each segment by a certain\n" .
"length of time. This script makes sure the output segments do not\n" .
"overlap as a result of this time-extension, and that there are no\n" .
"negative times in the output.\n";
exit 1;
}
$extend = $ARGV[0];
@all_lines = ();
while (<STDIN>) {
chop;
@A = split(" ", $_);
if (@A != 4) {
die "invalid line in segments file: $_";
}
$line = @all_lines; # current number of lines.
($utt_id, $reco_id, $start_time, $end_time) = @A;
push @all_lines, [ $utt_id, $reco_id, $start_time, $end_time ]; # anonymous array.
if (! defined $lines_for_reco{$reco_id}) {
$lines_for_reco{$reco_id} = [ ]; # push new anonymous array.
}
push @{$lines_for_reco{$reco_id}}, $line;
}
foreach $reco_id (keys %lines_for_reco) {
$ref = $lines_for_reco{$reco_id};
@line_numbers = sort { ${$all_lines[$a]}[2] <=> ${$all_lines[$b]}[2] } @$ref;
{
# handle start of earliest segment as a special case.
$l0 = $line_numbers[0];
$tstart = ${$all_lines[$l0]}[2] - $extend;
if ($tstart < 0.0) { $tstart = 0.0; }
${$all_lines[$l0]}[2] = $tstart;
}
{
# handle end of latest segment as a special case.
$lN = $line_numbers[$#line_numbers];
$tend = ${$all_lines[$lN]}[3] + $extend;
${$all_lines[$lN]}[3] = $tend;
}
for ($i = 0; $i < $#line_numbers; $i++) {
$ln = $line_numbers[$i];
$ln1 = $line_numbers[$i+1];
$tend = ${$all_lines[$ln]}[3]; # end of earlier segment.
$tstart = ${$all_lines[$ln1]}[2]; # start of later segment.
if ($tend > $tstart) {
$utt1 = ${$all_lines[$ln]}[0];
$utt2 = ${$all_lines[$ln1]}[0];
print STDERR "Warning: for utterances $utt1 and $utt2, segments " .
"already overlap; leaving these times unchanged.\n";
} else {
$my_extend = $extend;
$max_extend = 0.5 * ($tstart - $tend);
if ($my_extend > $max_extend) { $my_extend = $max_extend; }
$tend += $my_extend;
$tstart -= $my_extend;
${$all_lines[$ln]}[3] = $tend;
${$all_lines[$ln1]}[2] = $tstart;
}
}
}
# print the lines back out in their original input order.
for ($l = 0; $l < @all_lines; $l++) {
$ref = $all_lines[$l];
($utt_id, $reco_id, $start_time, $end_time) = @$ref;
printf("%s %s %.2f %.2f\n", $utt_id, $reco_id, $start_time, $end_time);
}
__END__
# testing below.
# ( echo a1 A 0 1; echo a2 A 3 4; echo b1 B 0 1; echo b2 B 2 3 ) | local/extend_segments.pl 1.0
a1 A 0.00 2.00
a2 A 2.00 5.00
b1 B 0.00 1.50
b2 B 1.50 4.00
# ( echo a1 A 0 2; echo a2 A 1 3 ) | local/extend_segments.pl 1.0
Warning: for utterances a1 and a2, segments already overlap; leaving these times unchanged.
a1 A 0.00 2.00
a2 A 1.00 4.00
# ( echo a1 A 0 2; echo a2 A 5 6; echo a3 A 3 4 ) | local/extend_segments.pl 1.0
a1 A 0.00 2.50
a2 A 4.50 7.00
a3 A 2.50 4.50
#!/usr/bin/env python3
# Copyright 2015 Minhua Wu
# Apache 2.0
# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
# BBCs to b._b._c.s
# BBC's to b._b._c.'s
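# For illustration (pronunciations built from MSU_single_letter.txt), a lexicon entry
#   BBC b iy b iy s iy      is rewritten as   b._b._c. b iy b iy s iy
# and
#   BBCs b iy b iy s iy z   is rewritten as   b._b._c.s b iy b iy s iy z
# while the acronyms map gets the line "BBC<tab>b._b._c.<tab>b b c" for later use on
# the transcripts.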
import argparse
import re
__author__ = "Minhua Wu"
parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
parser.add_argument("-i", "--input", help="Input lexicon", required=True)
parser.add_argument("-o", "--output", help="Output lexicon", required=True)
parser.add_argument(
"-L", "--Letter", help="Input single letter pronunciation", required=True
)
parser.add_argument("-M", "--Map", help="Output acronyms mapping", required=True)
args = parser.parse_args()
fin_lex = open(args.input, "r")
fin_Letter = open(args.Letter, "r")
fout_lex = open(args.output, "w")
fout_map = open(args.Map, "w")
# Initialise single letter dictionary
dict_letter = {}
for single_letter_lex in fin_Letter:
items = single_letter_lex.split()
dict_letter[items[0]] = single_letter_lex[len(items[0]) + 1 :].strip()
fin_Letter.close()
# print dict_letter
for lex in fin_lex:
items = lex.split()
word = items[0]
lexicon = lex[len(items[0]) + 1 :].strip()
# find acronyms from words with only letters and '
pre_match = re.match(r"^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$", word)
if pre_match:
# check whether a word of the form xxx's is an acronym
if word[-2:] == "'s" and (lexicon[-1] == "s" or lexicon[-1] == "z"):
actual_word = word[:-2]
actual_lexicon = lexicon[:-2]
acronym_lexicon = ""
for w in actual_word:
acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " "
if acronym_lexicon.strip() == actual_lexicon:
acronym_mapped = ""
acronym_mapped_back = ""
for w in actual_word[:-1]:
acronym_mapped = acronym_mapped + w.lower() + "._"
acronym_mapped_back = acronym_mapped_back + w.lower() + " "
acronym_mapped = acronym_mapped + actual_word[-1].lower() + ".'s"
acronym_mapped_back = (
acronym_mapped_back + actual_word[-1].lower() + "'s"
)
fout_map.write(
word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
)
fout_lex.write(acronym_mapped + " " + lexicon + "\n")
else:
fout_lex.write(lex)
# check whether a word of the form xxxs is an acronym
elif word[-1] == "s" and (lexicon[-1] == "s" or lexicon[-1] == "z"):
actual_word = word[:-1]
actual_lexicon = lexicon[:-2]
acronym_lexicon = ""
for w in actual_word:
acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " "
if acronym_lexicon.strip() == actual_lexicon:
acronym_mapped = ""
acronym_mapped_back = ""
for w in actual_word[:-1]:
acronym_mapped = acronym_mapped + w.lower() + "._"
acronym_mapped_back = acronym_mapped_back + w.lower() + " "
acronym_mapped = acronym_mapped + actual_word[-1].lower() + ".s"
acronym_mapped_back = (
acronym_mapped_back + actual_word[-1].lower() + "'s"
)
fout_map.write(
word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
)
fout_lex.write(acronym_mapped + " " + lexicon + "\n")
else:
fout_lex.write(lex)
# check whether a word of the form xxx (not ending in 's or s) is an acronym
elif word.find("'") == -1 and word[-1] != "s":
acronym_lexicon = ""
for w in word:
acronym_lexicon = acronym_lexicon + dict_letter[w.upper()] + " "
if acronym_lexicon.strip() == lexicon:
acronym_mapped = ""
acronym_mapped_back = ""
for w in word[:-1]:
acronym_mapped = acronym_mapped + w.lower() + "._"
acronym_mapped_back = acronym_mapped_back + w.lower() + " "
acronym_mapped = acronym_mapped + word[-1].lower() + "."
acronym_mapped_back = acronym_mapped_back + word[-1].lower()
fout_map.write(
word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
)
fout_lex.write(acronym_mapped + " " + lexicon + "\n")
else:
fout_lex.write(lex)
else:
fout_lex.write(lex)
else:
fout_lex.write(lex)
#!/usr/bin/env python3
# Copyright 2015 Minhua Wu
# Apache 2.0
# convert acronyms in swbd transcript to fisher convention
# according to first two columns in the input acronyms mapping
import argparse
import re
__author__ = "Minhua Wu"
parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
parser.add_argument("-i", "--input", help="Input transcripts", required=True)
parser.add_argument("-o", "--output", help="Output transcripts", required=True)
parser.add_argument("-M", "--Map", help="Input acronyms mapping", required=True)
args = parser.parse_args()
fin_map = open(args.Map, "r")
dict_acronym = {}
dict_acronym_noi = {} # Mapping of acronyms without I, i
for pair in fin_map:
items = pair.split("\t")
dict_acronym[items[0]] = items[1]
dict_acronym_noi[items[0]] = items[1]
fin_map.close()
del dict_acronym_noi["I"]
del dict_acronym_noi["i"]
fin_trans = open(args.input, "r")
fout_trans = open(args.output, "w")
for line in fin_trans:
items = line.split()
L = len(items)
# First pass: map I when it appears as part of an acronym
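# Scan left (x) and right (y) from this "I" over runs of single capital letters;
# e.g. in "I B M" all three tokens form one run, so the whole run is mapped through
# dict_acronym. A standalone "I" (x + y == 0) is left untouched here and is also
# skipped by the second pass, which uses dict_acronym_noi.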
for i in range(L):
if items[i] == "I":
x = 0
while i - 1 - x >= 0 and re.match(r"^[A-Z]$", items[i - 1 - x]):
x += 1
y = 0
while i + 1 + y < L and re.match(r"^[A-Z]$", items[i + 1 + y]):
y += 1
if x + y > 0:
for bias in range(-x, y + 1):
items[i + bias] = dict_acronym[items[i + bias]]
# Second pass mapping (not mapping 'i' and 'I')
for i in range(len(items)):
if items[i] in dict_acronym_noi.keys():
items[i] = dict_acronym_noi[items[i]]
sentence = " ".join(items[1:])
fout_trans.write(items[0] + " " + sentence.lower() + "\n")
fin_trans.close()
fout_trans.close()
#!/usr/bin/env bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
. ./path.sh
#check existing directories
if [ $# != 1 ]; then
echo "Usage: swbd1_data_download.sh /path/to/SWBD"
exit 1;
fi
SWBD_DIR=$1
dir=data/local/train
mkdir -p $dir
# Audio data directory check
if [ ! -d $SWBD_DIR ]; then
echo "Error: SWBD directory $SWBD_DIR does not exist"
exit 1;
fi
# Trans directory check
if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then
(
cd $dir;
if [ ! -d swb_ms98_transcriptions ]; then
echo " *** Downloading transcriptions and dictionary ***"
wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz ||
wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz
tar -xf switchboard_word_alignments.tar.gz
fi
)
else
echo "Directory with transcriptions exists, skipping downloading"
[ -f $dir/swb_ms98_transcriptions ] \
|| ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/
fi
#!/usr/bin/env bash
# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
# To be run from one directory above this script.
## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.
## The second input is optional, which should point to a directory containing
## Switchboard transcriptions/documentations (specifically, the conv.tab file).
## If specified, the script will try to use the actual speaker PINs provided
## with the corpus instead of the conversation side ID (Kaldi default). We
## will be using "find" to locate this file so we don't make any assumptions
## on the directory structure. (Peng Qi, Aug 2014)
. ./path.sh
#check existing directories
if [ $# != 1 -a $# != 2 ]; then
echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]"
exit 1;
fi
SWBD_DIR=$1
dir=data/local/train
mkdir -p $dir
# Audio data directory check
if [ ! -d $SWBD_DIR ]; then
echo "Error: SWBD directory $SWBD_DIR does not exist"
exit 1;
fi
# Option A: SWBD dictionary file check
[ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \
echo "SWBD dictionary file does not exist" && exit 1;
# find sph audio files
find -L $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist
n=`cat $dir/sph.flist | wc -l`
[ $n -ne 2435 ] && [ $n -ne 2438 ] && \
echo Warning: expected 2435 or 2438 data files, found $n
# (1a) Transcriptions preparation
# make basic transcription file (add segments info)
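# For illustration, a *-trans.text line like (hypothetical words)
#   sw2001A-ms98-a-0001 0.98 11.56 HI UM YEAH
# becomes
#   sw02001-A_000098-001156 HI UM YEAH
# matching the utterance-id convention used in the segments file below.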
awk '{
name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1);
stime=$2; etime=$3;
printf("%s-%s_%06.0f-%06.0f",
name, side, int(100*stime+0.5), int(100*etime+0.5));
for(i=4;i<=NF;i++) printf(" %s", $i); printf "\n"
}' $dir/swb_ms98_transcriptions/*/*/*-trans.text > $dir/transcripts1.txt
# test if trans. file is sorted
export LC_ALL=C;
sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
# Remove SILENCE, <B_ASIDE> and <E_ASIDE>.
# Note: we have [NOISE], [VOCALIZED-NOISE], [LAUGHTER], [SILENCE].
# removing [SILENCE], and the <B_ASIDE> and <E_ASIDE> markers that mark
# speech to someone; we will give phones to the other three (NSN, SPN, LAU).
# There will also be a silence phone, SIL.
# **NOTE: modified the pattern matches to make them case insensitive
cat $dir/transcripts1.txt \
| perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
s/<B_ASIDE>//gi;
s/<E_ASIDE>//gi;
print;' \
| awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt
# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
# case insensitive
local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text
# format acronyms in text
python3 local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \
-M data/local/dict_nosp/acronyms.map
mv $dir/text_map $dir/text
# (1c) Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{
segment=$1;
split(segment,S,"[_-]");
side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100
}' < $dir/text > $dir/segments
sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
> $dir/sph.scp
# side A - channel 1, side B - channel 2
bash tools/sph2wav.sh --nj 16 $dir/sph.scp $dir/segments $dir/wav.scp
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
awk '{print $1}' $dir/wav_ori.scp \
| perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
print "$1-$2 $1 $2\n"; ' \
> $dir/reco2file_and_channel || exit 1;
awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \
|| exit 1;
sort -k 2 $dir/utt2spk | tools/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
# We assume each conversation side is a separate speaker. This is a very
# reasonable assumption for Switchboard. The actual speaker info file is at:
# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
# Copy stuff into its final locations [this has been moved from the format_data
# script]
mkdir -p data/train
for f in spk2utt utt2spk wav.scp text; do
cp data/local/train/$f data/train/$f || exit 1;
done
if [ $# == 2 ]; then # fix speaker IDs
find $2 -name conv.tab > $dir/conv.tab
local/swbd1_fix_speakerid.pl `cat $dir/conv.tab` data/train
tools/utt2spk_to_spk2utt.pl data/train/utt2spk.new > data/train/spk2utt.new
# patch files
for f in spk2utt utt2spk text segments; do
cp data/train/$f data/train/$f.old || exit 1;
cp data/train/$f.new data/train/$f || exit 1;
done
rm $dir/conv.tab
fi
echo Switchboard-1 data preparation succeeded.
tools/fix_data_dir.sh data/train
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Author: Peng Qi (pengqi@cs.stanford.edu)
# This script maps Switchboard conversation-side speaker IDs to the true physical
# speakers and fixes the utterance IDs accordingly. It is expected to be run from
# one directory level above this script.
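# The script assumes conv.tab lines are comma-separated, with field 0 holding the
# conversation number and fields 2 and 3 holding the A-side and B-side speaker PINs;
# e.g. (illustrative) conversation 2001 with PINs 1000 and 1001 maps sw02001-A to
# speaker sw1000 and sw02001-B to speaker sw1001.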
sub trim {
(my $s = $_[0]) =~ s/^\s+|\s+$//g;
return $s;
}
if ($#ARGV != 1) {
print "Usage: swbd1_fix_speakerid.pl <swbd-conv-tab-file> <data-dir>\n";
print "E.g.: swbd1_fix_speakerid.pl /datasets/SWBD1Transcripts/tables/conv.tab data/train\n";
exit 1;
}
$tab_file = $ARGV[0];
$dir = $ARGV[1];
%conv_to_spk = ();
open(my $conv_tab, '<', $tab_file) or die "Could not open '$tab_file' $!\n";
while (my $line = <$conv_tab>) {
chomp $line;
my @fields = split "," , $line;
#$fields[0] = trim($fields[0]);
$fields[2] = trim($fields[2]);
$fields[3] = trim($fields[3]);
$conv_to_spk{'sw0' . $fields[0] . '-A'} = $fields[2];
$conv_to_spk{'sw0' . $fields[0] . '-B'} = $fields[3];
}
close($conv_tab);
# fix utt2spk
%missingconv = ();
open(my $utt2spk, '<', $dir . '/utt2spk') or die "Could not open '$dir/utt2spk' $!\n";
open(my $utt2spk_new, '>', $dir . '/utt2spk.new');
while (my $line = <$utt2spk>) {
chomp $line;
my @fields = split " " , $line;
my $convid = substr $fields[0], 0, 9;
if (exists $conv_to_spk{ $convid }) {
my $spkid = $conv_to_spk{ $convid };
$spkid = "sw" . $spkid;
my $newuttid = $spkid . '-' . (substr $fields[0], 2);
print $utt2spk_new "$newuttid $spkid\n";
} else {
my $convid = substr $convid, 3, 4;
$missingconv{$convid} = 1;
print $utt2spk_new $fields[0]." ".$fields[1]."\n";
}
}
close($utt2spk);
close($utt2spk_new);
foreach my $conv (keys %missingconv) {
print "Warning: Conversation ID '$conv' not found in conv.tab, retaining old speaker IDs\n"
}
# fix segments and text
foreach my $file ('segments','text') {
open(my $oldfile, '<', "$dir/$file") or die "Could not open '$dir/$file' $!\n";
open(my $newfile, '>', "$dir/$file.new");
while (my $line = <$oldfile>) {
chomp $line;
my $convid = substr $line, 0, 9;
if (exists $conv_to_spk{$convid}) {
my $spkid = $conv_to_spk{$convid};
print $newfile "sw$spkid-" . (substr $line, 2) . "\n";
} else {
print $newfile "$line\n";
}
}
}
#!/usr/bin/env perl
# Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern
# matches case-insensitive --Arnab (Jan 2013)
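# Usage examples (as invoked elsewhere in this recipe):
#   local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt  # map fields 2 to end (1-based),
#                                                         # leaving the utterance id alone
#   local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt       # map only field 1 (the word)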
if ($ARGV[0] eq "-f") {
shift @ARGV;
$field_spec = shift @ARGV;
if ($field_spec =~ m/^\d+$/) {
$field_begin = $field_spec - 1; $field_end = $field_spec - 1;
}
if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
if ($1 ne "") {
$field_begin = $1 - 1; # Change to zero-based indexing.
}
if ($2 ne "") {
$field_end = $2 - 1; # Change to zero-based indexing.
}
}
if (!defined $field_begin && !defined $field_end) {
die "Bad argument to -f option: $field_spec";
}
}
while (<>) {
@A = split(" ", $_);
for ($n = 0; $n < @A; $n++) {
$a = $A[$n];
if ( (!defined $field_begin || $n >= $field_begin)
&& (!defined $field_end || $n <= $field_end)) {
# e.g. [LAUGHTER-STORY] -> STORY;
$a =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i;
# $1 and $3 relate to preserving trailing "-"
$a =~ s:^\[(.+)/.+\](|\-)$:$1$2:; # e.g. [IT'N/ISN'T] -> IT'N ... note,
# 1st part may include partial-word stuff, which we process further below,
# e.g. [LEM[GUINI]-/LINGUINI]
# the (|\_) at the end is to accept and preserve trailing -'s.
$a =~ s:^(|\-)\[[^][]+\](.+)$:-$2:; # e.g. -[AN]Y , note \047 is quote;
# let the leading - be optional on input, as sometimes omitted.
$a =~ s:^(.+)\[[^][]+\](|\-)$:$1-:; # e.g. AB[SOLUTE]- -> AB-;
# let the trailing - be optional on input, as sometimes omitted.
$a =~ s:([^][]+)\[.+\]$:$1:; # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX-
# which is a mistake in the input.
$a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM
$a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT-
$a =~ s:_\d$::; # e.g. THEM_1 -> THEM
}
$A[$n] = $a;
}
print join(" ", @A) . "\n";
}
#!/usr/bin/env bash
# Formatting the Mississippi State dictionary for use in Edinburgh. Differs
# from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
# To be run from one directory above this script.
. ./path.sh
#check existing directories
[ $# != 0 ] && echo "Usage: local/swbd1_prepare_dict.sh" && exit 1;
srcdir=data/local/train # This is where swbd1_data_download.sh put the transcriptions.
dir=data/local/dict_nosp
mkdir -p $dir
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd1_data_download.sh was run already.
[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;
cp $srcdict $dir/lexicon0.txt || exit 1;
patch <local/dict.patch $dir/lexicon0.txt || exit 1;
#(2a) Dictionary preparation:
# Pre-processing (remove comments)
grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1.txt || exit 1;
cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
grep -v sil > $dir/nonsilence_phones.txt || exit 1;
( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt
# No "extra questions" in the input to this setup, as we don't
# have stress or tone.
echo -n >$dir/extra_questions.txt
cp local/MSU_single_letter.txt $dir/
# Add to the lexicon the silences, noises etc.
# Add single letter lexicon
# The original swbd lexicon does not have precise single-letter entries,
# e.g. it has no entry for W
( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \
echo '[laughter] lau'; echo '<unk> spn' ) \
| cat - $dir/lexicon1.txt $dir/MSU_single_letter.txt > $dir/lexicon2.txt || exit 1;
# Map the words in the lexicon. That is-- for each word in the lexicon, we map it
# to a new written form. The transformations we do are:
# remove laughter markings, e.g.
# [LAUGHTER-STORY] -> STORY
# Remove partial-words, e.g.
# -[40]1K W AH N K EY
# becomes -1K
# and
# -[AN]Y IY
# becomes
# -Y
# -[A]B[OUT]- B
# becomes
# -B-
# Also, curly braces, which appear to be used for "nonstandard"
# words or non-words, are removed, e.g.
# {WOLMANIZED} W OW L M AX N AY Z D
# -> WOLMANIZED
# Also, mispronounced words, e.g.
# [YEAM/YEAH] Y AE M
# are changed to just e.g. YEAM, i.e. the orthography
# of the mispronounced version.
# Note-- this is only really to be used in training. The main practical
# reason is to avoid having tons of disambiguation symbols, which
# we otherwise would get because there are many partial words with
# the same phone sequences (most problematic: S).
# Also, map
# THEM_1 EH M -> THEM
# so that multiple pronunciations just have alternate entries
# in the lexicon.
local/swbd1_map_words.pl -f 1 $dir/lexicon2.txt | sort -u \
> $dir/lexicon3.txt || exit 1;
python3 local/format_acronyms_dict.py -i $dir/lexicon3.txt -o $dir/lexicon4.txt \
-L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map
cat $dir/acronyms_raw.map | sort -u > $dir/acronyms.map
( echo 'i ay' )| cat - $dir/lexicon4.txt | tr '[A-Z]' '[a-z]' | sort -u > $dir/lexicon5.txt
pushd $dir >&/dev/null
ln -sf lexicon5.txt lexicon.txt # This is the final lexicon.
popd >&/dev/null
rm $dir/lexiconp.txt 2>/dev/null
echo Prepared input dictionary and phone-sets for Switchboard phase 1.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/bin/bash
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
. ./path.sh || exit 1;
# Use this to control how many GPUs you use. It's single-GPU training if you specify
# just one GPU; otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
# The number of nodes or machines used for multi-machine training.
# Default is 1 for single-machine/single-node training.
# NFS will be needed if you want to run multi-machine training.
num_nodes=1
# The rank of each node or machine, ranging from 0 to num_nodes - 1.
# The first node/machine sets node_rank 0, the second sets node_rank 1,
# the third sets node_rank 2, and so on. Default is 0.
node_rank=0
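# Example (illustrative): for two machines with 2 GPUs each, set num_nodes=2 and
# CUDA_VISIBLE_DEVICES="0,1" on both machines, with node_rank=0 on the first machine
# and node_rank=1 on the second; the world_size computed below is then 4.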
nj=16
feat_dir=raw_wav
data_type=shard # raw or shard
num_utts_per_shard=1000
prefetch=100
# bpemode (unigram or bpe)
nbpe=2000
bpemode=bpe
# data directory
swbd1_dir=/home/backup_nfs2/hlyu/swbd/LDC97S62
eval2000_dir="/home/backup_nfs2/hlyu/swbd/LDC2002S09/hub5e_00 /home/backup_nfs2/hlyu/swbd/LDC2002T43"
train_set=train_nodup
train_config=conf/train_conformer.yaml
cmvn=true
dir=exp/conformer
checkpoint=
# using average_checkpoint will usually give a better result
average_checkpoint=true
decode_checkpoint=$dir/final.pt
average_num=10
decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
. tools/parse_options.sh || exit 1;
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Data preparation
local/swbd1_data_download.sh ${swbd1_dir}
local/swbd1_prepare_dict.sh
local/swbd1_data_prep.sh ${swbd1_dir}
local/eval2000_data_prep.sh ${eval2000_dir}
# process the train set by
# 1) converting lowercase text to uppercase
# 2) removing the '._' and '.' acronym markers and mapping THEM_1 to THEM
# 3) splitting off a 4000-utterance dev set and keeping the rest for training
# 4) removing duplicated utterances
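# For illustration of steps 1) and 2), a train text line like (hypothetical)
#   sw02001-A_000098-001156 i._b._m. is great
# becomes
#   sw02001-A_000098-001156 I B M IS GREAT
# and any leftover THEM_1 tokens are mapped back to THEM.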
cp data/train/text data/train/text.org
paste -d" " <(cut -f 1 -d" " data/train/text.org) \
<(cut -f 2- -d" " data/train/text.org | tr "[:lower:]" "[:upper:]") > data/train/text
sed -i 's/\._/ /g; s/\.//g; s/THEM_1/THEM/g' data/train/text
tools/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min
n=$(($(wc -l < data/train/text) - 4000))
tools/subset_data_dir.sh --last data/train ${n} data/train_nodev
tools/data/remove_dup_utts.sh 300 data/train_nodev data/train_nodup
# process the eval2000 set by
# 1) removing hesitation tags such as (%AH) (%HESITATION) (%UH)
# 2) removing <B_ASIDE> <E_ASIDE>
# 3) removing "(" and ")"
# 4) dropping utterances whose text is empty
cp data/eval2000/text data/eval2000/text.org
paste -d "" \
<(cut -f 1 -d" " data/eval2000/text.org) \
<(awk '{$1=""; print toupper($0)}' data/eval2000/text.org \
| perl -pe 's| \(\%.*\)||g' | perl -pe 's| \<.*\>||g' \
| sed -e "s/(//g" -e "s/)//g") \
| sed -e 's/\s\+/ /g' > data/eval2000/text.org2
awk -F ' ' '{if(length($2) != 0) print $0}' data/eval2000/text.org2 > data/eval2000/text
tools/fix_data_dir.sh data/eval2000
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# For the wav feature, just copy the data; fbank extraction is done on the fly during training
mkdir -p ${feat_dir}
for x in ${train_set} train_dev eval2000; do
cp -r data/${x} ${feat_dir}
done
tools/compute_cmvn_stats.py --num_workers 16 --train_config ${train_config} \
--in_scp data/${train_set}/wav.scp \
--out_cmvn ${feat_dir}/${train_set}/global_cmvn
fi
dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_char/
echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk> 1" >> ${dict} # <unk> must be 1
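# After stage 2 the unit dictionary looks like (illustrative entries):
#   <blank> 0
#   <unk> 1
#   ▁THE 2
#   ...            (one line per BPE piece, numbered from 2)
#   <sos/eos> N    (N = number of lines above it)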
# the BPE-related code and scripts below are borrowed from ESPnet.
cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt
tools/spm_train --input=data/lang_char/input.txt \
--vocab_size=${nbpe} \
--character_coverage=1.0 \
--model_type=${bpemode} \
--model_prefix=${bpemodel} \
--input_sentence_size=100000000 \
--user_defined_symbols="[LAUGHTER],[NOISE],[VOCALIZED-NOISE]"
tools/spm_encode --model=${bpemodel}.model \
--output_format=piece < data/lang_char/input.txt | \
tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat ${dict} | wc -l)
echo "<sos/eos> ${num_token}" >> ${dict} # <eos>
wc -l ${dict}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Prepare data, prepare required format"
for x in ${train_set} train_dev eval2000; do
if [ ${data_type} == "shard" ]; then
tools/make_shard_list.py --num_utts_per_shard ${num_utts_per_shard} \
--num_threads ${nj} ${feat_dir}/${x}/wav.scp ${feat_dir}/${x}/text \
$(realpath ${feat_dir}/${x}/shards) ${feat_dir}/${x}/data.list
else
tools/make_raw_list.py ${feat_dir}/${x}/wav.scp ${feat_dir}/${x}/text \
${feat_dir}/${x}/data.list
fi
done
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Training
mkdir -p ${dir}
INIT_FILE=${dir}/ddp_init
# You should remove it manually before starting run.sh on the first node.
# rm -f $INIT_FILE # delete old one before starting
init_method=file://$(readlink -f ${INIT_FILE})
echo "$0: init method is $init_method"
# The number of GPUs running on each node/machine
num_gpus=$(echo ${CUDA_VISIBLE_DEVICES} | awk -F "," '{print NF}')
# Use "nccl" if it works, otherwise use "gloo"
dist_backend="nccl"
# The total number of processes/gpus, so that the master knows
# how many workers to wait for.
# More details about ddp can be found in
# https://pytorch.org/tutorials/intermediate/dist_tuto.html
world_size=`expr ${num_gpus} \* ${num_nodes}`
echo "total gpus is: ${world_size}"
cmvn_opts=
${cmvn} && cp ${feat_dir}/${train_set}/global_cmvn ${dir}
${cmvn} && cmvn_opts="--cmvn ${dir}/global_cmvn"
# train.py will write $train_config to $dir/train.yaml with model input
# and output dimension, train.yaml will be used for inference or model
# export later
for ((i = 0; i < ${num_gpus}; ++i)); do
{
gpu_id=$(echo ${CUDA_VISIBLE_DEVICES} | cut -d',' -f$[$i+1])
# Rank of each gpu/process, used to determine whether it is
# the master or a worker.
rank=`expr ${node_rank} \* ${num_gpus} + ${i}`
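# e.g. (illustrative) with 2 GPUs per node, GPU index 1 on node_rank 1 gets rank 1*2+1 = 3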
python wenet/bin/train.py --gpu ${gpu_id} \
--config ${train_config} \
--data_type ${data_type} \
--symbol_table ${dict} \
--prefetch ${prefetch} \
--bpe_model ${bpemodel}.model \
--train_data ${feat_dir}/${train_set}/data.list \
--cv_data ${feat_dir}/train_dev/data.list \
${checkpoint:+--checkpoint $checkpoint} \
--model_dir ${dir} \
--ddp.init_method ${init_method} \
--ddp.world_size ${world_size} \
--ddp.rank ${rank} \
--ddp.dist_backend ${dist_backend} \
--num_workers 4 \
${cmvn_opts} \
--pin_memory
} &
done
wait
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Test the model; specify the model you want to test with --checkpoint
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=${dir}/avg_${average_num}.pt
echo "do model average and final checkpoint is ${decode_checkpoint}"
python wenet/bin/average_model.py \
--dst_model ${decode_checkpoint} \
--src_path $dir \
--num ${average_num} \
--val_best
fi
# Specify decoding_chunk_size if it's a unified dynamic chunk trained model
# -1 for full chunk
decoding_chunk_size=
ctc_weight=0.5
reverse_weight=0.0
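# Note: ctc_weight balances the CTC and attention scores during rescoring (0.5 weights
# them equally); reverse_weight only matters for a bidirectional decoder, so it is left
# at 0.0 here because the decoder in the config is a standard left-to-right transformer.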
for mode in ${decode_modes}; do
{
test_dir=${dir}/test_${mode}
mkdir -p ${test_dir}
python wenet/bin/recognize.py --gpu 0 \
--mode $mode \
--config $dir/train.yaml \
--data_type $data_type \
--test_data $feat_dir/eval2000/data.list \
--checkpoint $decode_checkpoint \
--beam_size 10 \
--batch_size 1 \
--penalty 0.0 \
--dict $dict \
--bpe_model $bpemodel.model \
--ctc_weight $ctc_weight \
--reverse_weight $reverse_weight \
--result_file $test_dir/text \
${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
sed -i.bak -r 's/<blank> //g' ${test_dir}/text
mv ${test_dir}/text ${test_dir}/text.bak2
tools/spm_decode --model=${bpemodel}.model --input_format=piece \
< ${test_dir}/text.bak2 | sed -e "s/▁/ /g" > ${test_dir}/text
python tools/compute-wer.py --char=1 --v=1 \
$feat_dir/eval2000/text $test_dir/text > $test_dir/wer
}
done
wait
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# Export the best model you want
python wenet/bin/export_jit.py \
--config $dir/train.yaml \
--checkpoint $dir/avg_${average_num}.pt \
--output_file $dir/final.zip
fi
../../../tools/
../../../wenet/
# Performance Record
## Conformer Result
* Feature info: using fbank feature, dither, cmvn, without speed perturb (speed perturb with segments is not supported yet)
* Training info: lr 0.001, batch size 20, 8 gpu, acc_grad 1, 240 epochs, dither 0.1
* Decoding info: ctc_weight 0.5, average_num 10
| decoding mode | Dev WER | Test WER |
|---------------------|---------|----------|
| attention rescoring | 9.54% | 8.66% |
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
dataset_conf:
filter_conf:
max_length: 2000
min_length: 10
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 20
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
#!/usr/bin/env bash
# Copyright 2014 Nickolay V. Shmyrev
# 2014 Brno University of Technology (Author: Karel Vesely)
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
mkdir -p db
cd db ### Note: the rest of this script is executed from the directory 'db'.
# TED-LIUM database:
if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then
if [ ! -e TEDLIUM_release-3 ]; then
ln -sf /export/corpora5/TEDLIUM_release-3
fi
echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3"
else
if [ ! -e TEDLIUM_release-3 ]; then
echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)"
# the following command won't re-get it if it's already there
# because of the --continue switch.
wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1
tar xf "TEDLIUM_release-3.tgz"
else
echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists."
fi
fi
num_sph=$(find -L TEDLIUM_release-3/legacy -name '*.sph' | wc -l)
# We mainly use the TED-LIUM 3 "legacy" distribution, whose dev and test sets are the same as in TED-LIUM 2 (and TED-LIUM 1).
# It contains 2351 sph files for training and 19 sph files for dev/test (total 2370).
# Because the "legacy" contains symbolic links to "data", we use `find -L`.
if [ "$num_sph" != 2370 ]; then
echo "$0: expected to find 2370 .sph files in the directory db/TEDLIUM_release-3/legacy, found $num_sph"
exit 1
fi
exit 0
#!/usr/bin/env python3
#
# Copyright 2014 Nickolay V. Shmyrev
# 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
import sys
# This script joins together pairs of split-up words like "you 're" -> "you're".
# The TEDLIUM transcripts are normalized in a way that's not traditional for
# speech recognition.
prev_line = ""
for line in sys.stdin:
if line == prev_line:
continue
items = line.split()
new_items = []
i = 0
while i < len(items):
if i < len(items) - 1 and items[i + 1][0] == "'":
new_items.append(items[i] + items[i + 1])
i = i + 1
else:
new_items.append(items[i])
i = i + 1
print(" ".join(new_items))
prev_line = line