Commit 764b3a75 authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model

parents
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.
use strict;
use warnings;

# Single argument: the root directory under which the indexed files live.
defined $ARGV[0] or die "usage: $0 <wsj-dir> < ndx-file\n";
my $wsj_dir = $ARGV[0];

while (<STDIN>) {
    next if m/^;/;    # ";;"-prefixed lines are comments in .ndx files.
    m/^([0-9_]+):\s*(\S+)$/ or die "Could not parse line $_";
    my $filename = $2;    # path relative to the distributed disk root
    # Some index files omit the .wv1 extension; normalize it.
    $filename .= ".wv1" if $filename !~ m/\.wv1$/;
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
        print "$filename\n";
    } else {
        # Keep going on missing files; just warn on stderr.
        print STDERR "File $filename found in the index but not on disk\n";
    }
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line. (e.g. 4k0c030a is an utterance id).
# It looks up each utterance's transcript in the per-speaker "dot"
# files listed (one path per line) in the file given as the single
# command-line argument, and prints "<uttid> <transcript>" lines.
use strict;
use warnings;

@ARGV == 1 or die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
my $dot_flist = shift @ARGV;

# Map speaker id (first 6 chars of the dot-file basename) -> dot-file path.
my %spk2dot;
open(my $flist_fh, '<', $dot_flist) or die "Opening file list of dot files: $dot_flist\n";
while (<$flist_fh>) {
    chomp;
    m:\S+/(\w{6})00\.dot: or die "Bad line in dot file list: $_";
    $spk2dot{$1} = $_;
}
close($flist_fh);

my $curspk  = '';    # speaker whose dot file is currently loaded
my $dotfile = '';
my %utt2trans;       # transcripts for the current speaker only
while (<STDIN>) {
    chomp;
    my $uttid_orig = $_;
    # Only the first 8 chars form the canonical utterance id; anything
    # after that (e.g. a channel suffix) is kept for output but not lookup.
    my $uttid = substr $uttid_orig, 0, 8;
    $uttid =~ m:(\w{6})\w\w: or die "Bad utterance id $_";
    my $spk = $1;
    if ($spk ne $curspk) {
        # New speaker: reload the cache instead of keeping all
        # transcripts in memory at once.
        %utt2trans = ();
        $curspk    = $spk;
        $dotfile   = $spk2dot{$spk};
        defined $dotfile or die "No dot file for speaker $spk\n";
        open(my $dot_fh, '<', $dotfile) or die "Error opening dot file $dotfile\n";
        while (<$dot_fh>) {
            # Dot-file lines look like: "SOME TRANSCRIPT (4k0c030a)".
            m:(.+)\((\w{8})\)\s*$: or die "Bad line $_ in dot file $dotfile (line $.)\n";
            $utt2trans{$2} = $1;
        }
        close($dot_fh);
    }
    if (!defined $utt2trans{$uttid}) {
        print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
    } else {
        print "$uttid_orig $utt2trans{$uttid}\n";
    }
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes on its standard input a list of utterance
# id's, one for each line. (e.g. 4k0c030a is an utterance id).
# It looks up each utterance's transcript in the per-speaker "dot"
# files listed (one path per line) in the file given as the single
# command-line argument, and prints "<uttid> <transcript>" lines.
use strict;
use warnings;

@ARGV == 1 or die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
my $dot_flist = shift @ARGV;

# Map speaker id (first 6 chars of the dot-file basename) -> dot-file path.
my %spk2dot;
open(my $flist_fh, '<', $dot_flist) or die "Opening file list of dot files: $dot_flist\n";
while (<$flist_fh>) {
    chomp;
    m:\S+/(\w{6})00\.dot: or die "Bad line in dot file list: $_";
    $spk2dot{$1} = $_;
}
close($flist_fh);

my $curspk  = '';    # speaker whose dot file is currently loaded
my $dotfile = '';
my %utt2trans;       # transcripts for the current speaker only
while (<STDIN>) {
    chomp;
    my $uttid = $_;
    $uttid =~ m:(\w{6})\w\w: or die "Bad utterance id $_";
    my $spk = $1;
    if ($spk ne $curspk) {
        # New speaker: reload the cache instead of keeping all
        # transcripts in memory at once.
        %utt2trans = ();
        $curspk    = $spk;
        $dotfile   = $spk2dot{$spk};
        defined $dotfile or die "No dot file for speaker $spk\n";
        open(my $dot_fh, '<', $dotfile) or die "Error opening dot file $dotfile\n";
        while (<$dot_fh>) {
            # Dot-file lines look like: "SOME TRANSCRIPT (4k0c030a)".
            m:(.+)\((\w{8})\)\s*$: or die "Bad line $_ in dot file $dotfile (line $.)\n";
            $utt2trans{$2} = $1;
        }
        close($dot_fh);
    }
    if (!defined $utt2trans{$uttid}) {
        print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
    } else {
        print "$uttid $utt2trans{$uttid}\n";
    }
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# takes in a file list with lines like
# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# and outputs an scp in kaldi format with lines like
# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
# (the first thing is the utterance-id, which is the same as the basename of the file.
use strict;
use warnings;

while (<>) {
    m:^\S+/(\w+)\.[wW][vV]1$: or die "Bad line $_";
    # Lowercase the id: disk 13-16.1 was distributed with uppercase filenames.
    my $id = lc $1;
    print "$id $_";    # $_ still carries its trailing newline
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#and as command-line arguments it takes the names of the WSJ disk locations, e.g.:
#/mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
# It outputs a list of absolute pathnames (it does this by replacing e.g. 11_1_1 with
# /mnt/matylda2/data/WSJ0/11-1.1.
# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was distributed with
# uppercase rather than lower case filenames.
use strict;
use warnings;

# Map disk id (e.g. 11_1_1) -> mount path, derived from each argument's
# last path component (e.g. .../11-1.1 -> 11_1_1).
my %disk2fn;
foreach my $fn (@ARGV) {
    $fn =~ m:.+/([0-9\.\-]+)/?$: or die "Bad command-line argument $fn\n";
    my $disk_id = $1;
    $disk_id =~ tr/-./__/;    # replace '-' and '.' with '_': 11-10.1 -> 11_10_1
    $fn =~ s:/$::;            # remove final slash, just in case it is present
    $disk2fn{$disk_id} = $fn;
}

# Check the hostname once, not once per index line (backticks spawn a process).
my $on_but_system = (`hostname` =~ m/fit\.vutbr\.cz/);

while (<STDIN>) {
    next if m/^;/;    # ";;"-prefixed lines are comments in .ndx files.
    m/^([0-9_]+):\s*(\S+)$/ or die "Could not parse line $_";
    my ($disk, $filename) = ($1, $2);    # $filename is relative to the disk root
    defined $disk2fn{$disk} or die "Disk id $disk not found";
    if ($disk eq "13_16_1" && $on_but_system) {
        # The disk 13-16.1 has been uppercased for some reason, on the
        # BUT system. This is a fix specifically for that case.
        $filename =~ tr/a-z/A-Z/;
    }
    print "$disk2fn{$disk}/$filename\n";
}
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This takes data from the standard input that's unnormalized transcripts in the format
# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
# and outputs normalized transcripts.
# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
# Usage: normalize_transcript.pl <noise-word> < transcript > transcript2
# All noise annotations that are kept are replaced by <noise-word>.
@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
$noise_word = shift @ARGV;
while(<STDIN>) {
# Each line is "<utterance-id> <raw words...>"; the id passes through unchanged.
$_ =~ m:^(\S+) (.+): || die "bad line $_";
$utt = $1;
$trans = $2;
print "$utt";
# NOTE: substitutions below are order-sensitive; backslash removal must
# precede the %PERCENT / .POINT rewrites.
foreach $w (split (" ",$trans)) {
$w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary. .
$w =~ s:\\::g; # Remove backslashes. We don't need the quoting.
$w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
$w =~ s:^\.POINT$:POINT:; # Normalization for Nov'93 test transcripts.
if($w =~ m:^\[\<\w+\]$: || # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
$w =~ m:^\[\w+\>\]$: || # E.g. [door_slam>], this means a door slammed in the next word. Delete.
$w =~ m:\[\w+/\]$: || # E.g. [phone_ring/], which indicates the start of this phenomenon.
$w =~ m:\[\/\w+]$: || # E.g. [/phone_ring], which indicates the end of this phenomenon.
$w eq "~" || # This is used to indicate truncation of an utterance. Not a word.
$w eq ".") { # "." is used to indicate a pause. Silence is optional anyway so not much
# point including this in the transcript.
next; # we won't print this word.
} elsif($w =~ m:\[\w+\]:) { # Other noises, e.g. [loud_breath]: map to the noise word.
print " $noise_word";
} elsif($w =~ m:^\<([\w\']+)\>$:) {
# e.g. replace <and> with and. (the <> means verbal deletion of a word).. but it's pronounced.
print " $1";
} elsif($w eq "--DASH") {
print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
# } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
# print " $1 -DASH";
} else {
print " $w";
}
}
print "\n";
}
#!/usr/bin/env bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *real enhanced* data.
# Utterance ids get a "_REAL" suffix derived from the wav basename.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1;
if [ $# -ne 2 ]; then
printf "\nUSAGE: %s <enhancement-name> <enhanced-speech-directory>\n\n" `basename $0`
echo "The argument should be a the directory that only contains enhanced speech data."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
enhan=$1
audio_dir=$2
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
utils=$PWD/utils
odir=$PWD/data/chime4
if $eval_flag; then
list_set="tr05_real_$enhan dt05_real_$enhan et05_real_$enhan"
else
list_set="tr05_real_$enhan dt05_real_$enhan"
fi
cd $dir
# Build per-set file lists of the enhanced wav files (bus/caf/ped/str environments).
find $audio_dir/ -name '*.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_$enhan.flist
find $audio_dir/ -name '*.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_$enhan.flist
if $eval_flag; then
find $audio_dir/ -name '*.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_$enhan.flist
fi
# make a scp file from file list
# (id = wav basename with ".wav" replaced by "_REAL")
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
done
#make a transcription from dot
# NOTE(review): assumes {tr,dt,et}05_real.dot already exist in $dir
# (produced by the noisy-data prep script) — run that first.
cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> tr05_real_$enhan.ids
cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_$enhan.txt
paste -d" " tr05_real_$enhan.ids tr05_real_$enhan.txt | sort -k 1 > tr05_real_$enhan.trans1
cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> dt05_real_$enhan.ids
cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_$enhan.txt
paste -d" " dt05_real_$enhan.ids dt05_real_$enhan.txt | sort -k 1 > dt05_real_$enhan.trans1
if $eval_flag; then
cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> et05_real_$enhan.ids
cat et05_real.dot | sed -e 's/(.*)//' > et05_real_$enhan.txt
paste -d" " et05_real_$enhan.ids et05_real_$enhan.txt | sort -k 1 > et05_real_$enhan.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
mkdir -p $odir/$x
cp ${x}_wav.scp $odir/$x/wav.scp || exit 1;
cp ${x}.txt $odir/$x/text || exit 1;
done
echo "Data preparation succeeded"
#!/usr/bin/env bash
set -eu
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
# Modified to use data of six channels
# Szu-Jui Chen 09/29/2017
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *real noisy* data,
# keeping all six microphone channels (CH1-CH6) as separate utterances.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level Chime4 directory."
echo "It is assumed that there will be a 'data' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
audio_dir=$1/data/audio/16kHz/isolated/
trans_dir=$1/data/transcriptions
echo "extract all channels (CH[1-6].wav) for noisy data"
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
if $eval_flag; then
list_set="tr05_real_noisy dt05_real_noisy et05_real_noisy"
else
list_set="tr05_real_noisy dt05_real_noisy"
fi
cd $dir
# Build per-set file lists of all six-channel wav files.
find $audio_dir -name '*CH[1-6].wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_noisy.flist
find $audio_dir -name '*CH[1-6].wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_noisy.flist
if $eval_flag; then
find $audio_dir -name '*CH[1-6].wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_noisy.flist
fi
# make a dot format from json annotation files
cp $trans_dir/tr05_real.dot_all tr05_real.dot
cp $trans_dir/dt05_real.dot_all dt05_real.dot
if $eval_flag; then
cp $trans_dir/et05_real.dot_all et05_real.dot
fi
# make a scp temporary file from file list
# The id is rearranged so the channel number follows the speaker prefix:
# e.g. F01_050C0101_BUS.CH1_REAL -> F01_CH1_050C0101_BUS_REAL-style sort keys.
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.id.temp
cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch
cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1
cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2
paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp
done
#make a transcription from dot
# Each dot line is duplicated six times, once per channel (CH1..CH6).
cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> tr05_real_noisy.ids
cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_noisy.txt
paste -d" " tr05_real_noisy.ids tr05_real_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > tr05_real_noisy.trans1
cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> dt05_real_noisy.ids
cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_noisy.txt
paste -d" " dt05_real_noisy.ids dt05_real_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > dt05_real_noisy.trans1
if $eval_flag; then
cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_REAL"}'> et05_real_noisy.ids
cat et05_real.dot | sed -e 's/(.*)//' > et05_real_noisy.txt
paste -d" " et05_real_noisy.ids et05_real_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > et05_real_noisy.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
# Re-key the transcripts with the scp's rearranged utterance ids
# (same sort order on both sides keeps them aligned).
cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1
cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2
paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
sort ${x}_wav.scp.temp > ${x}_wav.scp
mkdir -p ../../chime4/$x
cp ${x}_wav.scp ../../chime4/$x/wav.scp || exit 1;
cp ${x}.txt ../../chime4/$x/text || exit 1;
done
# clean up temp files
rm *.temp
rm *.part{1,2}
echo "Data preparation succeeded"
#!/usr/bin/env bash
set -eu
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *simulated enhanced* data.
# Utterance ids get a "_SIMU" suffix derived from the wav basename.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1;
if [ $# -ne 2 ]; then
printf "\nUSAGE: %s <enhancement-name> <enhanced-speech-directory>\n\n" `basename $0`
echo "The argument should be a the directory that only contains enhanced speech data."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
enhan=$1
audio_dir=$2
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
utils=$PWD/utils
odir=$PWD/data/chime4
if $eval_flag; then
list_set="tr05_simu_$enhan dt05_simu_$enhan et05_simu_$enhan"
else
list_set="tr05_simu_$enhan dt05_simu_$enhan"
fi
cd $dir
# Build per-set file lists of the enhanced wav files (bus/caf/ped/str environments).
find $audio_dir/ -name '*.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_$enhan.flist
find $audio_dir/ -name '*.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_$enhan.flist
if $eval_flag; then
find $audio_dir/ -name '*.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_$enhan.flist
fi
# make a scp file from file list
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
done
# make a transcription from dot
# simulation training data extract dot file from original WSJ0 data
# since it is generated from these data
if [ ! -e dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh";
exit 1;
fi
# tr05: look up WSJ0 transcripts by the lowercased utterance part of the id.
cat tr05_simu_${enhan}_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \
| $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_$enhan.txt
cat tr05_simu_${enhan}_wav.scp | cut -f 1 -d" " > tr05_simu_$enhan.ids
paste -d" " tr05_simu_$enhan.ids tr05_simu_$enhan.txt | sort -k 1 > tr05_simu_$enhan.trans1
# dt05 and et05 simulation data are generated from the CHiME4 booth recording
# and we use CHiME4 dot files
cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> dt05_simu_$enhan.ids
cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_$enhan.txt
paste -d" " dt05_simu_$enhan.ids dt05_simu_$enhan.txt | sort -k 1 > dt05_simu_$enhan.trans1
if $eval_flag; then
cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> et05_simu_$enhan.ids
cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_$enhan.txt
paste -d" " et05_simu_$enhan.ids et05_simu_$enhan.txt | sort -k 1 > et05_simu_$enhan.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
mkdir -p $odir/$x
cp ${x}_wav.scp $odir/$x/wav.scp || exit 1;
cp ${x}.txt $odir/$x/text || exit 1;
done
echo "Data preparation succeeded"
#!/usr/bin/env bash
set -eu
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Modified from the script for CHiME2 baseline
# Shinji Watanabe 02/13/2015
# Modified to use data of six channels
# Szu-Jui Chen 09/29/2017
#
# Prepares Kaldi-style wav.scp/text for the CHiME4 *simulated noisy* data,
# keeping all six microphone channels (CH1-CH6) as separate utterances.
# Config:
eval_flag=true # make it true when the evaluation data are released
. tools/parse_options.sh || exit 1;
if [ $# -ne 1 ]; then
printf "\nUSAGE: %s <corpus-directory>\n\n" `basename $0`
echo "The argument should be a the top-level Chime4 directory."
echo "It is assumed that there will be a 'data' subdirectory"
echo "within the top-level corpus directory."
exit 1;
fi
echo "$0 $@" # Print the command line for logging
audio_dir=$1/data/audio/16kHz/isolated/
trans_dir=$1/data/transcriptions
echo "extract all channels (CH[1-6].wav) for noisy data"
dir=$PWD/data/chime4/local
mkdir -p $dir
local=$PWD/local
utils=$PWD/utils
if $eval_flag; then
list_set="tr05_simu_noisy dt05_simu_noisy et05_simu_noisy"
else
list_set="tr05_simu_noisy dt05_simu_noisy"
fi
cd $dir
# Build per-set file lists of all six-channel wav files.
find $audio_dir -name '*CH[1-6].wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_noisy.flist
find $audio_dir -name '*CH[1-6].wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_noisy.flist
if $eval_flag; then
find $audio_dir -name '*CH[1-6].wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_noisy.flist
fi
# make a dot format from json annotation files
cp $trans_dir/dt05_simu.dot_all dt05_simu.dot
if $eval_flag; then
cp $trans_dir/et05_simu.dot_all et05_simu.dot
fi
# make a scp file from file list
# The id is rearranged so the channel number follows the speaker prefix.
for x in $list_set; do
cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.id.temp
cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch
cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1
cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2
paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids
paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp
done
# make a transcription from dot
# simulation training data extract dot file from original WSJ0 data
# since it is generated from these data
if [ ! -e dot_files.flist ]; then
echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh";
exit 1;
fi
# tr05: look up WSJ0 transcripts by the lowercased utterance part of the id.
cat tr05_simu_noisy_wav.scp.temp | awk -F'[_]' '{print $3}' | tr '[A-Z]' '[a-z]' \
| $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt
cat tr05_simu_noisy_wav.scp.temp | cut -f 1 -d" " > tr05_simu_noisy.ids
paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -t_ -k1,1 -k3 > tr05_simu_noisy.trans1
# dt05 and et05 simulation data are generated from the CHiME4 booth recording
# and we use CHiME4 dot files
# (each dot line is duplicated six times, once per channel CH1..CH6)
cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> dt05_simu_noisy.ids
cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_noisy.txt
paste -d" " dt05_simu_noisy.ids dt05_simu_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > dt05_simu_noisy.trans1
if $eval_flag; then
cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> et05_simu_noisy.ids
cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_noisy.txt
paste -d" " et05_simu_noisy.ids et05_simu_noisy.txt | \
awk '{print}{sub(/CH1/, "CH2",$0);print}{sub(/CH2/, "CH3",$0);print}{sub(/CH3/, "CH4",$0);print}{sub(/CH4/, "CH5",$0);print}{sub(/CH5/, "CH6",$0);print}' | \
sort -k 1 > et05_simu_noisy.trans1
fi
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in $list_set;do
# Re-key the transcripts with the scp's rearranged utterance ids
# (same sort order on both sides keeps them aligned).
cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1
cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2
paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1
cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
| sort > $x.txt || exit 1;
done
# copying data to data/...
for x in $list_set; do
sort ${x}_wav.scp.temp > ${x}_wav.scp
mkdir -p ../../chime4/$x
cp ${x}_wav.scp ../../chime4/$x/wav.scp || exit 1;
cp ${x}.txt ../../chime4/$x/text || exit 1;
done
# clean up temp files
rm *.temp
rm *.part{1,2}
echo "Data preparation succeeded"
# Runtime environment for this example: put the WeNet libtorch build outputs
# (decoder binaries, kaldi tools, openfst) on PATH, and the repo root on
# PYTHONPATH. Sourced by run.sh via ". ./path.sh".
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
#!/usr/bin/env bash
# Copyright 2020 Jian Wu
# License: Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
#
# CHiME4 recipe driver. Stages:
#   1: data preparation   2: dictionary   3: cmvn + data lists
#   4: training           5: averaging + decoding + WER scoring
# Select stages with --stage "beg-end" (e.g. --stage "1-4").
set -eu
stage="1-4"
space="<space>"
track="isolated_1ch_track"
# NOTE(review): double slash in //scratch looks like a typo for /scratch —
# harmless to POSIX path resolution, but confirm against the actual mount.
wsj1_data_dir=//scratch/jwu/wsj1
chime4_data_dir=/scratch/jwu/CHiME4
dump_wav_dir=/scratch/jwu/chime4_wav
data_dir=data/chime4
dict=$data_dir/dict_char.txt
train_config=conf/train_conformer.yaml
exp_dir=exp/1a
decode_modes="ctc_prefix_beam_search attention_rescoring"
average_checkpoint=true
average_num=10
. ./path.sh
. ./tools/parse_options.sh || exit 1
# Parse "beg-end" stage range; a single number means just that stage.
beg=$(echo $stage | awk -F '-' '{print $1}')
end=$(echo $stage | awk -F '-' '{print $2}')
[ -z $end ] && end=$beg
if [ $end -ge 1 ] && [ $beg -le 1 ]; then
echo "Stage 1: preparing data ..."
./local/clean_wsj0_data_prep.sh $chime4_data_dir/CHiME3/data/WSJ0
./local/simu_noisy_chime4_data_prep.sh $chime4_data_dir
./local/real_noisy_chime4_data_prep.sh $chime4_data_dir
./local/simu_enhan_chime4_data_prep.sh $track $chime4_data_dir/data/audio/16kHz/$track
./local/real_enhan_chime4_data_prep.sh $track $chime4_data_dir/data/audio/16kHz/$track
./local/clean_wsj1_data_prep.sh $wsj1_data_dir
./local/chime4_format_dir.sh
fi
# Stage 2: build the character dictionary (<blank>=0, <unk>=1, chars, <sos/eos>).
if [ $end -ge 2 ] && [ $beg -le 2 ]; then
echo -e "<NOISE>\n<*IN*>\n<*MR.*>" > $data_dir/train/non_lang.txt
for name in dev train; do
python tools/text2token.py $data_dir/$name/text -n 1 -s 1 \
-l $data_dir/train/non_lang.txt > $data_dir/$name/char
done
mkdir -p $(dirname $dict) && echo -e "<blank> 0\n<unk> 1" > ${dict}
cat $data_dir/train/char | cut -f 2- -d" " | tr " " "\n" | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
num_token=$(cat $dict | wc -l)
echo "<sos/eos> $num_token" >> $dict
echo "Make dictionary done"
fi
# Stage 3: dump wavs, compute global CMVN stats, and write raw data lists.
if [ $end -ge 3 ] && [ $beg -le 3 ]; then
./local/chime4_gen_wav.sh $data_dir/train $dump_wav_dir
tools/compute_cmvn_stats.py --num_workers 16 \
--train_config $train_config \
--in_scp $data_dir/train/wav.scp \
--out_cmvn $data_dir/train/global_cmvn
echo "Prepare data, prepare required format"
for x in train dev; do
tools/make_raw_list.py $data_dir/$x/wav.scp $data_dir/$x/char \
$data_dir/$x/data.list
done
fi
# Stage 4: train the model (single GPU; log goes to $exp_dir/train.log).
if [ $end -ge 4 ] && [ $beg -le 4 ]; then
mkdir -p $exp_dir && cp $data_dir/train/global_cmvn $exp_dir
python wenet/bin/train.py \
--gpu 0 \
--config $train_config \
--train_data $data_dir/train/data.list \
--cv_data $data_dir/dev/data.list \
--model_dir $exp_dir \
--num_workers 4 \
--symbol_table $dict \
--cmvn $exp_dir/global_cmvn \
--pin_memory > $exp_dir/train.log 2>&1
fi
suffix="isolated_1ch_track"
# Stage 5: optionally average checkpoints, then decode all four test sets
# in each mode (decodes run in parallel per set) and score WER.
if [ $end -ge 5 ] && [ $beg -le 5 ]; then
if [ ${average_checkpoint} == true ]; then
decode_checkpoint=$exp_dir/avg_${average_num}.pt
echo "do model average and final checkpoint is $decode_checkpoint"
python wenet/bin/average_model.py \
--dst_model $decode_checkpoint \
--src_path $exp_dir \
--num ${average_num} \
--val_best
fi
nj=4
ctc_weight=0.5
for x in dt05_{simu,real} et05_{simu,real}; do
subdir=${x}_${suffix}
tools/make_raw_list.py $data_dir/$subdir/wav.scp $data_dir/$subdir/text \
$data_dir/$subdir/data.list
done
for mode in ${decode_modes}; do
for x in dt05_{simu,real} et05_{simu,real}; do
subdir=${x}_${suffix}
dec_dir=$exp_dir/${subdir}_${mode} && mkdir -p $dec_dir
python wenet/bin/recognize.py \
--gpu 0 \
--mode $mode \
--config $exp_dir/train.yaml \
--test_data $data_dir/$subdir/data.list \
--checkpoint $exp_dir/avg_${average_num}.pt \
--beam_size 8 \
--batch_size 1 \
--dict $dict \
--ctc_weight $ctc_weight \
--result_file $dec_dir/text &
done
wait
done
for mode in ${decode_modes}; do
for x in dt05_{simu,real} et05_{simu,real}; do
subdir=${x}_${suffix}
dec_dir=$exp_dir/${subdir}_${mode}
sed 's:<space>: :g' $dec_dir/text > $dec_dir/text.norm
python tools/compute-wer.py --char=1 --v=1 \
$data_dir/$subdir/text $dec_dir/text.norm > $dec_dir/wer
done
done
fi
../../../tools
\ No newline at end of file
../../../wenet
\ No newline at end of file
# Performance Record
# Requires ffmpeg and pandas to be installed.
## Conformer Result
* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, warmup_steps 20000, batch size 8, 3 GPUs, 30 epochs
* Decoding info: average_num 20
| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 16.12% |
| ctc_prefix_beam_search | 16.07% |
| attention | 13.56% |
| attention_rescoring | 14.01% |
\ No newline at end of file
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
# data pipeline: filtering, resampling, fbank features, spec augmentation,
# shuffling/sorting buffers, and dynamic batching
dataset_conf:
    split_with_space: true
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 40
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'dynamic' # static or dynamic
        batch_size: 8
# training hyper-parameters
grad_clip: 10
accum_grad: 4
max_epoch: 30
log_interval: 200
optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 20000
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import re
def process(src_str):
    """Remove punctuation (ASCII and common full-width marks) and upper-case.

    Args:
        src_str: raw transcript sentence from the Common Voice tsv.

    Returns:
        The sentence with every character from ``punc`` removed and the
        remainder upper-cased.
    """
    punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。，？》《{}'
    # BUG FIX: punc must be escaped before being embedded in a character
    # class.  Unescaped, the '-' in sequences like "+-=" forms a range
    # (0x2B-0x3D) that silently strips the digits 0-9 (and ':;<') from
    # transcripts.  re.escape() makes every character literal.
    return re.sub("[{0}]+".format(re.escape(punc)), "", src_str).upper()
if __name__ == '__main__':
    # Usage: create_scp_text.py <common_voice_dir> <subset> <output_dir>
    # Converts <common_voice_dir>/clips/*.mp3 to 16 kHz mono wavs under
    # <common_voice_dir>/wavs/, then reads <subset>.tsv and writes the
    # Kaldi-style wav.scp / text / utt2spk files into <output_dir>.
    import subprocess
    import pandas

    src_dir = sys.argv[1]
    tsv_file = src_dir + "/" + sys.argv[2] + ".tsv"
    output_dir = sys.argv[3]

    for file_path in os.listdir(src_dir + "/clips"):
        utt_id = file_path.split('.')[0]
        wav_path = src_dir + "/wavs/" + utt_id + ".wav"
        if os.path.exists(wav_path):
            continue  # already converted by a previous run
        clip_path = src_dir + "/clips/" + file_path
        # List-form subprocess call: no shell is involved, so paths holding
        # spaces or shell metacharacters cannot break or inject commands
        # (the previous os.system + str.format was unsafe here).
        subprocess.run(["ffmpeg", "-i", clip_path, "-ac", "1",
                        "-ar", "16000", "-f", "wav", wav_path], check=False)

    tsv_content = pandas.read_csv(tsv_file, sep="\t")
    path_list = tsv_content["path"]
    sentence = tsv_content["sentence"]
    client_list = tsv_content["client_id"]

    # 'with' guarantees the three output files are flushed and closed even
    # if a malformed row raises; encoding pinned so transcripts stay UTF-8.
    with open(output_dir + "/wav.scp", "w", encoding="utf-8") as scp_file, \
         open(output_dir + "/text", "w", encoding="utf-8") as text_file, \
         open(output_dir + "/utt2spk", "w", encoding="utf-8") as utt2spk:
        for i in range(len(path_list)):
            utt_id = path_list[i].split(".")[0]
            now_sentence = process(sentence[i])
            wav_file = src_dir + "/wavs/" + utt_id + ".wav"
            scp_file.write(utt_id + " " + wav_file + "\n")
            text_file.write(utt_id + " " + now_sentence + "\n")
            utt2spk.write(utt_id + " " + client_list[i] + "\n")
#!/usr/bin/env bash
# Download the Common Voice 8.0 French corpus and extract it.
#   $1: directory to place the downloaded archive in
#   $2: directory to extract the corpus into
if [ $# -le 1 ]; then
  echo "Args_Error:Two parameters are required."
  exit 1;
fi
download_path=$1
data_France=$2
url=https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz
# The archive is a gzipped tar, so name it accordingly (it used to be saved
# as tmp.zip, which tar happened to handle but was misleading).  Check each
# step's exit status: a truncated download must not be silently extracted.
wget -O "${download_path}/tmp.tar.gz" "${url}" || { echo "Error: download failed" >&2; exit 1; }
tar -xvf "${download_path}/tmp.tar.gz" -C "${data_France}" || { echo "Error: extraction failed" >&2; exit 1; }
# rm -f, not -rf: the target is a regular file, not a directory tree.
rm -f "${download_path}/tmp.tar.gz"
\ No newline at end of file
#!/usr/bin/env bash
# Create fresh data/{train,dev,test} directories and generate the Kaldi
# wav.scp/text/utt2spk files from a Common Voice French source directory.
#   $1: Common Voice French source directory
if [ $# -le 0 ]; then
  echo "Argument should be France src directory, see ../run.sh for example."
  exit 1;
fi
dir=`pwd`/data
local=`pwd`/local
src_path=$1

# Recreate data/ from scratch on every run.
rm -rf "${dir}"
mkdir -p "${dir}"

for x in train dev test; do
  # BUG FIX: the original test was '[ ! ${dir}/${x} ]' (missing -d).  A
  # non-empty string is always true, so '! string' was always false and the
  # then-branch was dead code; rm+mkdir expresses the real intent directly.
  rm -rf "${dir}/${x}"
  mkdir -p "${dir}/${x}"
done

# Ensure the wav output directory exists for the mp3->wav conversion.
if [ ! -d "${src_path}/wavs" ]; then
  mkdir "${src_path}/wavs"
fi

for x in train dev test; do
  python3 "${local}/create_scp_text.py" "${src_path}" "${x}" "${dir}/${x}"
done
# Locations of the wenet libtorch runtime build, relative to this example dir.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
# Put the compiled wenet binaries, kaldi tools, and OpenFST tools on PATH.
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Make the wenet python package importable from the repo root.
export PYTHONPATH=../../../:$PYTHONPATH
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment