v1.0

72f5785f · huaerkl · 72f5785f · 72f5785f · 72f5785f · 72f5785f
Commit 72f5785f authored Aug 15, 2023 by huaerkl
8 changed files
--- a/examples/mms/asr/infer/mms_infer.py
+++ b/examples/mms/asr/infer/mms_infer.py
+#!/usr/bin/env python -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import soundfile as sf
+import tempfile
+from pathlib import Path
+import os
+import subprocess
+import sys
+import re
+
+def parser():
+    parser = argparse.ArgumentParser(description="ASR inference script for MMS model")
+    parser.add_argument("--model", type=str, help="path to ASR model", required=True)
+    parser.add_argument("--audio", type=str, help="path to audio file", required=True, nargs='+')
+    parser.add_argument("--lang", type=str, help="audio language", required=True)
+    parser.add_argument("--format", type=str, choices=["none", "letter"], default="letter")
+    parser.add_argument("--extra-infer-args", type=str, default="")
+    return parser.parse_args()
+
+def reorder_decode(hypos):
+    outputs = []
+    for hypo in hypos:
+        idx = int(re.findall("\(None-(\d+)\)$", hypo)[0])
+        hypo = re.sub("\(\S+\)$", "", hypo).strip()
+        outputs.append((idx, hypo))
+    outputs = sorted(outputs)
+    return outputs
+
+def process(args):    
+    with tempfile.TemporaryDirectory() as tmpdir:
+        print(">>> preparing tmp manifest dir ...", file=sys.stderr)
+        tmpdir = Path(tmpdir)
+        with open(tmpdir / "dev.tsv", "w") as fw, open(tmpdir / "dev.uid", "w") as fu:
+            fw.write("/\n")
+            for audio in args.audio:
+                nsample = sf.SoundFile(audio).frames
+                fw.write(f"{audio}\t{nsample}\n")
+                fu.write(f"{audio}\n")
+        with open(tmpdir / "dev.ltr", "w") as fw:
+            fw.write("d u m m y | d u m m y |\n"*len(args.audio))
+        with open(tmpdir / "dev.wrd", "w") as fw:
+            fw.write("dummy dummy\n"*len(args.audio))
+        cmd = f"""
+        PYTHONPATH=. PREFIX=INFER HYDRA_FULL_ERROR=1 python examples/speech_recognition/new/infer.py -m --config-dir examples/mms/asr/config/ --config-name infer_common decoding.type=viterbi dataset.max_tokens=1440000 distributed_training.distributed_world_size=1 "common_eval.path='{args.model}'" task.data={tmpdir} dataset.gen_subset="{args.lang}:dev" common_eval.post_process={args.format} decoding.results_path={tmpdir} {args.extra_infer_args}
+        """
+        print(">>> loading model & running inference ...", file=sys.stderr)
+        subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL,)
+        with open(tmpdir/"hypo.word") as fr:
+            hypos = fr.readlines()
+            outputs = reorder_decode(hypos)
+            for ii, hypo in outputs:
+                hypo = re.sub("\(\S+\)$", "", hypo).strip()
+                print(f'===============\nInput: {args.audio[ii]}\nOutput: {hypo}')
+
+
+if __name__ == "__main__":
+    args = parser()
+    process(args)
--- a/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb
+++ b/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb
--- a/examples/mms/data_prep/README.md
+++ b/examples/mms/data_prep/README.md
+# Data Preparation 
+
+We describe the process of aligning long audio files with their transcripts and generating shorter audio segments below. 
+
+- Step 1:  Download and install torchaudio using the nightly version. We have open sourced the CTC forced alignment algorithm described in our paper via [torchaudio](https://github.com/pytorch/audio/pull/3348). 
+  ```
+  pip install --pre torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
+  ```
+  
+- Step 2: Download [uroman](https://github.com/isi-nlp/uroman) from Github. It is a universal romanizer which converts text in any script to the Latin alphabet. Use [this link](https://www.isi.edu/~ulf/uroman.html) to try their web interface.  
+  ```
+  git clone git@github.com:isi-nlp/uroman.git
+  ```
+  
+- Step 3: Install a few other dependencies 
+  ```
+  pip install sox 
+  pip install dataclasses 
+  ```  
+
+- Step 4: Create a text file containing the transcript for a (long) audio file. Each line in the text file will correspond to a separate audio segment that will be generated upon alignment.
+
+  Example content of the input text file :
+  ```
+  Text of the desired first segment
+  Text of the desired second segment
+  Text of the desired third segment
+  ```
+
+- Step 5: Run forced alignment and segment the audio file into shorter segments. 
+  ```
+  python align_and_segment.py --audio /path/to/audio.wav --textfile /path/to/textfile --lang <iso> --outdir /path/to/output --uroman /path/to/uroman/bin 
+  ```
+
+  The above code  will generated the audio segments under output directory based on the content of each line in the input text file. The `manifest.json` file consisting of the of segmented audio filepaths and their corresponding transcripts. 
+
+  ```
+  > head /path/to/output/manifest.json 
+
+  {"audio_start_sec": 0.0, "audio_filepath": "/path/to/output/segment1.flac", "duration": 6.8, "text": "she wondered afterwards how she could have spoken with that hard serenity how she could have", "normalized_text": "she wondered afterwards how she could have spoken with that hard serenity how she could have", "uroman_tokens": "s h e w o n d e r e d a f t e r w a r d s h o w s h e c o u l d h a v e s p o k e n w i t h t h a t h a r d s e r e n i t y h o w s h e c o u l d h a v e"}
+  {"audio_start_sec": 6.8, "audio_filepath": "/path/to/output/segment2.flac", "duration": 5.3, "text": "gone steadily on with story after story poem after poem till", "normalized_text": "gone steadily on with story after story poem after poem till", "uroman_tokens": "g o n e s t e a d i l y o n w i t h s t o r y a f t e r s t o r y p o e m a f t e r p o e m t i l l"}
+  {"audio_start_sec": 12.1, "audio_filepath": "/path/to/output/segment3.flac", "duration": 5.9, "text": "allan's grip on her hands relaxed and he fell into a heavy tired sleep", "normalized_text": "allan's grip on her hands relaxed and he fell into a heavy tired sleep", "uroman_tokens": "a l l a n ' s g r i p o n h e r h a n d s r e l a x e d a n d h e f e l l i n t o a h e a v y t i r e d s l e e p"}
+  ```
+
+  To visualize the segmented audio files, [Speech Data Explorer](https://github.com/NVIDIA/NeMo/tree/main/tools/speech_data_explorer) tool from NeMo toolkit can be used.  
+
+  As our alignment model outputs uroman tokens for input audio in any language, it also works with non-english audio and their corresponding transcripts. 
--- a/examples/mms/data_prep/align_and_segment.py
+++ b/examples/mms/data_prep/align_and_segment.py
+import os
+import torch
+import torchaudio
+import sox
+import json
+import argparse
+
+
+from examples.mms.data_prep.text_normalization import text_normalize
+from examples.mms.data_prep.align_utils import (
+    get_uroman_tokens,
+    time_to_frame,
+    load_model_dict,
+    merge_repeats,
+    get_spans,
+)
+import torchaudio.functional as F
+
+SAMPLING_FREQ = 16000
+EMISSION_INTERVAL = 30
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+def generate_emissions(model, audio_file):
+    waveform, _ = torchaudio.load(audio_file)  # waveform: channels X T
+    waveform = waveform.to(DEVICE)
+    total_duration = sox.file_info.duration(audio_file)
+
+    audio_sf = sox.file_info.sample_rate(audio_file)
+    assert audio_sf == SAMPLING_FREQ
+
+    emissions_arr = []
+    with torch.inference_mode():
+        i = 0
+        while i < total_duration:
+            segment_start_time, segment_end_time = (i, i + EMISSION_INTERVAL)
+
+            context = EMISSION_INTERVAL * 0.1
+            input_start_time = max(segment_start_time - context, 0)
+            input_end_time = min(segment_end_time + context, total_duration)
+            waveform_split = waveform[
+                :,
+                int(SAMPLING_FREQ * input_start_time) : int(
+                    SAMPLING_FREQ * (input_end_time)
+                ),
+            ]
+
+            model_outs, _ = model(waveform_split)
+            emissions_ = model_outs[0]
+            emission_start_frame = time_to_frame(segment_start_time)
+            emission_end_frame = time_to_frame(segment_end_time)
+            offset = time_to_frame(input_start_time)
+
+            emissions_ = emissions_[
+                emission_start_frame - offset : emission_end_frame - offset, :
+            ]
+            emissions_arr.append(emissions_)
+            i += EMISSION_INTERVAL
+
+    emissions = torch.cat(emissions_arr, dim=0).squeeze()
+    emissions = torch.log_softmax(emissions, dim=-1)
+
+    stride = float(waveform.size(1) * 1000 / emissions.size(0) / SAMPLING_FREQ)
+
+    return emissions, stride
+
+
+def get_alignments(
+    audio_file,
+    tokens,
+    model,
+    dictionary,
+    use_star,
+):
+    # Generate emissions
+    emissions, stride = generate_emissions(model, audio_file)
+    T, N = emissions.size()
+    if use_star:
+        emissions = torch.cat([emissions, torch.zeros(T, 1).to(DEVICE)], dim=1)
+
+    # Force Alignment
+    if tokens:
+        token_indices = [dictionary[c] for c in " ".join(tokens).split(" ") if c in dictionary]
+    else:
+        print(f"Empty transcript!!!!! for audio file {audio_file}")
+        token_indices = []
+
+    blank = dictionary["<blank>"]
+    
+    targets = torch.tensor(token_indices, dtype=torch.int32).to(DEVICE)
+    input_lengths = torch.tensor(emissions.shape[0])
+    target_lengths = torch.tensor(targets.shape[0])
+
+    path, _ = F.forced_align(
+        emissions, targets, input_lengths, target_lengths, blank=blank
+    )
+    path = path.to("cpu").tolist()
+    segments = merge_repeats(path, {v: k for k, v in dictionary.items()})
+    return segments, stride
+
+
+def main(args):
+    assert not os.path.exists(
+        args.outdir
+    ), f"Error: Output path exists already {args.outdir}"
+    
+    transcripts = []
+    with open(args.text_filepath) as f:
+        transcripts = [line.strip() for line in f]
+    print("Read {} lines from {}".format(len(transcripts), args.text_filepath))
+
+    norm_transcripts = [text_normalize(line.strip(), args.lang) for line in transcripts]
+    tokens = get_uroman_tokens(norm_transcripts, args.uroman_path, args.lang)
+
+    model, dictionary = load_model_dict()
+    model = model.to(DEVICE)
+    if args.use_star:
+        dictionary["<star>"] = len(dictionary)
+        tokens = ["<star>"] + tokens
+        transcripts = ["<star>"] + transcripts
+        norm_transcripts = ["<star>"] + norm_transcripts
+
+    segments, stride = get_alignments(
+        args.audio_filepath,
+        tokens,
+        model,
+        dictionary,
+        args.use_star,
+    )
+    # Get spans of each line in input text file
+    spans = get_spans(tokens, segments)
+
+    os.makedirs(args.outdir)
+    with open( f"{args.outdir}/manifest.json", "w") as f:
+        for i, t in enumerate(transcripts):
+            span = spans[i]
+            seg_start_idx = span[0].start
+            seg_end_idx = span[-1].end
+
+            output_file = f"{args.outdir}/segment{i}.flac"
+
+            audio_start_sec = seg_start_idx * stride / 1000
+            audio_end_sec = seg_end_idx * stride / 1000 
+
+            tfm = sox.Transformer()
+            tfm.trim(audio_start_sec , audio_end_sec)
+            tfm.build_file(args.audio_filepath, output_file)
+            
+            sample = {
+                "audio_start_sec": audio_start_sec,
+                "audio_filepath": str(output_file),
+                "duration": audio_end_sec - audio_start_sec,
+                "text": t,
+                "normalized_text":norm_transcripts[i],
+                "uroman_tokens": tokens[i],
+            }
+            f.write(json.dumps(sample) + "\n")
+
+    return segments, stride
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Align and segment long audio files")
+    parser.add_argument(
+        "-a", "--audio_filepath", type=str, help="Path to input audio file"
+    )
+    parser.add_argument(
+        "-t", "--text_filepath", type=str, help="Path to input text file "
+    )
+    parser.add_argument(
+        "-l", "--lang", type=str, default="eng", help="ISO code of the language"
+    )
+    parser.add_argument(
+        "-u", "--uroman_path", type=str, default="eng", help="Location to uroman/bin"
+    )
+    parser.add_argument(
+        "-s",
+        "--use_star",
+        action="store_true",
+        help="Use star at the start of transcript",
+    )
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=str,
+        help="Output directory to store segmented audio files",
+    )
+    print("Using torch version:", torch.__version__)
+    print("Using torchaudio version:", torchaudio.__version__)
+    print("Using device: ", DEVICE)
+    args = parser.parse_args()
+    main(args)
--- a/examples/mms/data_prep/align_utils.py
+++ b/examples/mms/data_prep/align_utils.py
+import re 
+import os
+import torch
+import tempfile
+import math
+from dataclasses import dataclass
+from torchaudio.models import wav2vec2_model
+
+# iso codes with specialized rules in uroman
+special_isos_uroman = "ara, bel, bul, deu, ell, eng, fas, grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid".split(",")
+special_isos_uroman = [i.strip() for i in special_isos_uroman]
+
+def normalize_uroman(text):
+    text = text.lower()
+    text = re.sub("([^a-z' ])", " ", text)
+    text = re.sub(' +', ' ', text)
+    return text.strip()
+
+
+def get_uroman_tokens(norm_transcripts, uroman_root_dir, iso = None):
+    tf = tempfile.NamedTemporaryFile()  
+    tf2 = tempfile.NamedTemporaryFile()  
+    with open(tf.name, "w") as f:
+        for t in norm_transcripts:
+            f.write(t + "\n")
+
+    assert os.path.exists(f"{uroman_root_dir}/uroman.pl"), "uroman not found"
+    cmd = f"perl {uroman_root_dir}/uroman.pl"
+    if iso in special_isos_uroman:
+        cmd += f" -l {iso} "
+    cmd +=  f" < {tf.name} > {tf2.name}" 
+    os.system(cmd)
+    outtexts = []
+    with open(tf2.name) as f:
+        for line in f:
+            line = " ".join(line.strip())
+            line =  re.sub(r"\s+", " ", line).strip()
+            outtexts.append(line)
+    assert len(outtexts) == len(norm_transcripts)
+    uromans = []
+    for ot in outtexts:
+        uromans.append(normalize_uroman(ot))
+    return uromans
+
+
+
+@dataclass
+class Segment:
+    label: str
+    start: int
+    end: int
+
+    def __repr__(self):
+        return f"{self.label}: [{self.start:5d}, {self.end:5d})"
+
+    @property
+    def length(self):
+        return self.end - self.start
+
+
+def merge_repeats(path, idx_to_token_map):
+    i1, i2 = 0, 0
+    segments = []
+    while i1 < len(path):
+        while i2 < len(path) and path[i1] == path[i2]:
+            i2 += 1
+        segments.append(Segment(idx_to_token_map[path[i1]], i1, i2 - 1))
+        i1 = i2
+    return segments
+
+
+def time_to_frame(time):
+    stride_msec = 20
+    frames_per_sec = 1000 / stride_msec
+    return int(time * frames_per_sec)
+
+
+
+def load_model_dict():
+    model_path_name = "/tmp/ctc_alignment_mling_uroman_model.pt"
+
+    print("Downloading model and dictionary...")
+    if os.path.exists(model_path_name):
+        print("Model path already exists. Skipping downloading....")
+    else:
+        torch.hub.download_url_to_file(
+            "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt",
+            model_path_name,
+        )
+        assert os.path.exists(model_path_name)
+    state_dict = torch.load(model_path_name, map_location="cpu")
+
+    model = wav2vec2_model(
+        extractor_mode="layer_norm",
+        extractor_conv_layer_config=[
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        extractor_conv_bias=True,
+        encoder_embed_dim=1024,
+        encoder_projection_dropout=0.0,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=24,
+        encoder_num_heads=16,
+        encoder_attention_dropout=0.0,
+        encoder_ff_interm_features=4096,
+        encoder_ff_interm_dropout=0.1,
+        encoder_dropout=0.0,
+        encoder_layer_norm_first=True,
+        encoder_layer_drop=0.1,
+        aux_num_out=31,
+    )
+    model.load_state_dict(state_dict)
+    model.eval()
+
+    dict_path_name = "/tmp/ctc_alignment_mling_uroman_model.dict"
+    if os.path.exists(dict_path_name):
+        print("Dictionary path already exists. Skipping downloading....")
+    else:
+        torch.hub.download_url_to_file(
+            "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/dictionary.txt",
+            dict_path_name,
+        )
+        assert os.path.exists(dict_path_name)
+    dictionary = {}
+    with open(dict_path_name) as f:
+        dictionary = {l.strip(): i for i, l in enumerate(f.readlines())}
+
+    return model, dictionary
+
+def get_spans(tokens, segments):
+    ltr_idx = 0
+    tokens_idx = 0
+    intervals = []
+    start, end = (0, 0)
+    sil = "<blank>"
+    for (seg_idx, seg) in enumerate(segments):
+        if(tokens_idx == len(tokens)):
+           assert(seg_idx == len(segments) - 1)
+           assert(seg.label == '<blank>')
+           continue
+        cur_token = tokens[tokens_idx].split(' ')
+        ltr = cur_token[ltr_idx]
+        if seg.label == "<blank>": continue
+        assert(seg.label == ltr)
+        if(ltr_idx) == 0: start = seg_idx
+        if ltr_idx == len(cur_token) - 1:
+            ltr_idx = 0
+            tokens_idx += 1
+            intervals.append((start, seg_idx))
+            while tokens_idx < len(tokens) and len(tokens[tokens_idx]) == 0:
+                    intervals.append((seg_idx, seg_idx))
+                    tokens_idx += 1
+        else:
+            ltr_idx += 1
+    spans = []
+    for (idx, (start, end)) in enumerate(intervals):
+        span = segments[start:end + 1]
+        if start > 0:
+            prev_seg = segments[start - 1]
+            if prev_seg.label == sil:
+                pad_start = prev_seg.start if (idx == 0) else int((prev_seg.start + prev_seg.end)/2)
+                span = [Segment(sil, pad_start, span[0].start)] + span
+        if end+1 < len(segments):
+            next_seg = segments[end+1]
+            if next_seg.label == sil:
+                pad_end = next_seg.end if (idx == len(intervals) - 1) else math.floor((next_seg.start + next_seg.end) / 2)
+                span = span + [Segment(sil, span[-1].end, pad_end)]
+        spans.append(span)
+    return spans
--- a/examples/mms/data_prep/norm_config.py
+++ b/examples/mms/data_prep/norm_config.py
+import os
+import re
+
+
+colon = ":"
+comma = ","
+exclamation_mark = "!"
+period = re.escape(".")
+question_mark = re.escape("?")
+semicolon = ";"
+
+left_curly_bracket = "{"
+right_curly_bracket = "}"
+quotation_mark = '"'
+
+basic_punc = (
+    period
+    + question_mark
+    + comma
+    + colon
+    + exclamation_mark
+    + left_curly_bracket
+    + right_curly_bracket
+)
+
+# General punc unicode block (0x2000-0x206F)
+zero_width_space = r"\u200B"
+zero_width_nonjoiner = r"\u200C"
+left_to_right_mark = r"\u200E"
+right_to_left_mark = r"\u200F"
+left_to_right_embedding = r"\u202A"
+pop_directional_formatting = r"\u202C"
+
+# Here are some commonly ill-typed versions of apostrophe
+right_single_quotation_mark = r"\u2019"
+left_single_quotation_mark = r"\u2018"
+
+# Language specific definitions
+# Spanish
+inverted_exclamation_mark = r"\u00A1"
+inverted_question_mark = r"\u00BF"
+
+
+# Hindi
+hindi_danda = u"\u0964"
+
+# Egyptian Arabic
+# arabic_percent = r"\u066A"
+arabic_comma = r"\u060C"
+arabic_question_mark = r"\u061F"
+arabic_semicolon = r"\u061B"
+arabic_diacritics = r"\u064B-\u0652"
+
+
+arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
+
+
+# Chinese
+full_stop = r"\u3002"
+full_comma = r"\uFF0C"
+full_exclamation_mark = r"\uFF01"
+full_question_mark = r"\uFF1F"
+full_semicolon = r"\uFF1B"
+full_colon = r"\uFF1A"
+full_parentheses = r"\uFF08\uFF09"
+quotation_mark_horizontal = r"\u300C-\u300F"
+quotation_mark_vertical = r"\uFF41-\uFF44"
+title_marks = r"\u3008-\u300B"
+wavy_low_line = r"\uFE4F"
+ellipsis = r"\u22EF"
+enumeration_comma = r"\u3001"
+hyphenation_point = r"\u2027"
+forward_slash = r"\uFF0F"
+wavy_dash = r"\uFF5E"
+box_drawings_light_horizontal = r"\u2500"
+fullwidth_low_line = r"\uFF3F"
+chinese_punc = (
+    full_stop
+    + full_comma
+    + full_exclamation_mark
+    + full_question_mark
+    + full_semicolon
+    + full_colon
+    + full_parentheses
+    + quotation_mark_horizontal
+    + quotation_mark_vertical
+    + title_marks
+    + wavy_low_line
+    + ellipsis
+    + enumeration_comma
+    + hyphenation_point
+    + forward_slash
+    + wavy_dash
+    + box_drawings_light_horizontal
+    + fullwidth_low_line
+)
+
+# Armenian
+armenian_apostrophe = r"\u055A"
+emphasis_mark = r"\u055B"
+exclamation_mark = r"\u055C"
+armenian_comma = r"\u055D"
+armenian_question_mark = r"\u055E"
+abbreviation_mark = r"\u055F"
+armenian_full_stop = r"\u0589"
+armenian_punc = (
+    armenian_apostrophe
+    + emphasis_mark
+    + exclamation_mark
+    + armenian_comma
+    + armenian_question_mark
+    + abbreviation_mark
+    + armenian_full_stop
+)
+
+lesser_than_symbol = r"&lt;"
+greater_than_symbol = r"&gt;"
+
+lesser_than_sign = r"\u003c"
+greater_than_sign = r"\u003e"
+
+nbsp_written_form = r"&nbsp"
+
+# Quotation marks
+left_double_quotes = r"\u201c"
+right_double_quotes = r"\u201d"
+left_double_angle = r"\u00ab"
+right_double_angle = r"\u00bb"
+left_single_angle = r"\u2039"
+right_single_angle = r"\u203a"
+low_double_quotes = r"\u201e"
+low_single_quotes = r"\u201a"
+high_double_quotes = r"\u201f"
+high_single_quotes = r"\u201b"
+
+all_punct_quotes = (
+    left_double_quotes
+    + right_double_quotes
+    + left_double_angle
+    + right_double_angle
+    + left_single_angle
+    + right_single_angle
+    + low_double_quotes
+    + low_single_quotes
+    + high_double_quotes
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+)
+mapping_quotes = (
+    "["
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+    + "]"
+)
+
+
+# Digits
+
+english_digits = r"\u0030-\u0039"
+bengali_digits = r"\u09e6-\u09ef"
+khmer_digits = r"\u17e0-\u17e9"
+devanagari_digits = r"\u0966-\u096f"
+oriya_digits = r"\u0b66-\u0b6f"
+extended_arabic_indic_digits = r"\u06f0-\u06f9"
+kayah_li_digits = r"\ua900-\ua909"
+fullwidth_digits = r"\uff10-\uff19"
+malayam_digits = r"\u0d66-\u0d6f"
+myanmar_digits = r"\u1040-\u1049"
+roman_numeral = r"\u2170-\u2179"
+nominal_digit_shapes = r"\u206f"
+
+# Load punctuations from MMS-lab data
+with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
+    punc_list = punc_f.readlines()
+
+punct_pattern = r""    
+for punc in punc_list:
+    # the first character in the tab separated line is the punc to be removed
+    punct_pattern += re.escape(punc.split("\t")[0])
+
+shared_digits = (
+    english_digits
+    + bengali_digits
+    + khmer_digits
+    + devanagari_digits
+    + oriya_digits
+    + extended_arabic_indic_digits
+    + kayah_li_digits
+    + fullwidth_digits
+    + malayam_digits
+    + myanmar_digits
+    + roman_numeral
+    + nominal_digit_shapes
+)
+
+shared_punc_list = (
+    basic_punc
+    + all_punct_quotes
+    + greater_than_sign
+    + lesser_than_sign
+    + inverted_question_mark
+    + full_stop
+    + semicolon
+    + armenian_punc
+    + inverted_exclamation_mark
+    + arabic_comma
+    + enumeration_comma
+    + hindi_danda
+    + quotation_mark
+    + arabic_semicolon
+    + arabic_question_mark
+    + chinese_punc
+    + punct_pattern
+
+)
+
+shared_mappping = {
+    lesser_than_symbol: "",
+    greater_than_symbol: "",
+    nbsp_written_form: "",
+    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
+}
+
+shared_deletion_list = (
+    left_to_right_mark
+    + zero_width_nonjoiner
+    + arabic_subscript_alef_and_inverted_damma
+    + zero_width_space
+    + arabic_diacritics
+    + pop_directional_formatting
+    + right_to_left_mark
+    + left_to_right_embedding
+)
+
+norm_config = {
+    "*": {
+        "lower_case": True,
+        "punc_set": shared_punc_list,
+        "del_set": shared_deletion_list,
+        "mapping": shared_mappping,
+        "digit_set": shared_digits,
+        "unicode_norm": "NFKC",
+        "rm_diacritics" : False,
+    }
+}
+
+#=============== Mongolian ===============#
+
+norm_config["mon"] = norm_config["*"].copy()
+# add soft hyphen to punc list to match with fleurs
+norm_config["mon"]["del_set"] += r"\u00AD"
+
+norm_config["khk"] = norm_config["mon"].copy()
+
+#=============== Hebrew ===============#
+
+norm_config["heb"] = norm_config["*"].copy()
+# add "HEBREW POINT" symbols to match with fleurs
+norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
+
+#=============== Thai ===============#
+
+norm_config["tha"] = norm_config["*"].copy()
+# add "Zero width joiner" symbols to match with fleurs
+norm_config["tha"]["punc_set"] += r"\u200D"
+
+#=============== Arabic ===============#
+norm_config["ara"] = norm_config["*"].copy()
+norm_config["ara"]["mapping"]["ٱ"] = "ا"
+norm_config["arb"] = norm_config["ara"].copy()
+
+#=============== Javanese ===============#
+norm_config["jav"] = norm_config["*"].copy()
+norm_config["jav"]["rm_diacritics"] = True
+
--- a/examples/mms/data_prep/punctuations.lst
+++ b/examples/mms/data_prep/punctuations.lst
+	7355	INVALID UNICODE	0x81
+	5265	INVALID UNICODE	0x90
+	75	INVALID UNICODE	0x8
+	31	INVALID UNICODE	0x8d
+	3	INVALID UNICODE	0x94
+	2	INVALID UNICODE	0x8f
+	2	INVALID UNICODE	0x1a
+	1	INVALID UNICODE	0x9d
+	1	INVALID UNICODE	0x93
+	1	INVALID UNICODE	0x92
+	8647	INVALID UNICODE	0xe295
+	6650	INVALID UNICODE	0xf21d
+	6234	INVALID UNICODE	0xf62d
+	4815	INVALID UNICODE	0xf173
+	4789	INVALID UNICODE	0xe514
+	4409	INVALID UNICODE	0xe293
+	3881	INVALID UNICODE	0xf523
+	3788	INVALID UNICODE	0xe233
+	2448	INVALID UNICODE	0xf50f
+	2177	INVALID UNICODE	0xe232
+	1955	INVALID UNICODE	0xea7b
+	1926	INVALID UNICODE	0xf172
+	973	INVALID UNICODE	0xe290
+	972	INVALID UNICODE	0xf519
+	661	INVALID UNICODE	0xe292
+	591	INVALID UNICODE	0xe328
+	509	INVALID UNICODE	0xe2fa
+	458	INVALID UNICODE	0xe234
+	446	INVALID UNICODE	0xe043
+	419	INVALID UNICODE	0xe040
+	399	INVALID UNICODE	0xe2fb
+	387	INVALID UNICODE	0xe32b
+	381	INVALID UNICODE	0xe236
+	374	INVALID UNICODE	0xf511
+	314	INVALID UNICODE	0xe517
+	296	INVALID UNICODE	0xe2fe
+	293	INVALID UNICODE	0xe492
+	291	INVALID UNICODE	0xf52d
+	289	INVALID UNICODE	0xe2fc
+	195	INVALID UNICODE	0xf521
+	190	INVALID UNICODE	0xe516
+	182	INVALID UNICODE	0xe041
+	178	INVALID UNICODE	0xf529
+	113	INVALID UNICODE	0xe2f9
+	87	INVALID UNICODE	0xe2d9
+	78	INVALID UNICODE	0xe32a
+	76	INVALID UNICODE	0xe291
+	74	INVALID UNICODE	0xe296
+	66	INVALID UNICODE	0xe518
+	52	INVALID UNICODE	0xe32c
+	46	INVALID UNICODE	0xe2db
+	41	INVALID UNICODE	0xe231
+	34	INVALID UNICODE	0xf522
+	33	INVALID UNICODE	0xf518
+	32	INVALID UNICODE	0xf513
+	27	INVALID UNICODE	0xe32d
+	25	INVALID UNICODE	0xe32e
+	23	INVALID UNICODE	0xe06b
+	15	INVALID UNICODE	0xea01
+	12	INVALID UNICODE	0xe294
+	11	INVALID UNICODE	0xe203
+	8	INVALID UNICODE	0xf218
+	7	INVALID UNICODE	0xe070
+	7	INVALID UNICODE	0xe013
+	5	INVALID UNICODE	0xe2de
+	4	INVALID UNICODE	0xe493
+	3	INVALID UNICODE	0xf7e8
+	3	INVALID UNICODE	0xf7d0
+	3	INVALID UNICODE	0xe313
+	2	INVALID UNICODE	0xe329
+	2	INVALID UNICODE	0xe06d
+	2	INVALID UNICODE	0xe003
+	1	INVALID UNICODE	0xf50e
+	1	INVALID UNICODE	0xf171
+	1	INVALID UNICODE	0xe01d
+⁯	71	NOMINAL DIGIT SHAPES	0x206f
+⁠	3	WORD JOINER	0x2060
+―	126545	HORIZONTAL BAR	0x2015
+־	1028	HEBREW PUNCTUATION MAQAF	0x5be
+)	98429	RIGHT PARENTHESIS	0x29
+]	27108	RIGHT SQUARE BRACKET	0x5d
+⌋	1567	RIGHT FLOOR	0x230b
+〕	97	RIGHT TORTOISE SHELL BRACKET	0x3015
+】	36	RIGHT BLACK LENTICULAR BRACKET	0x3011
+﴾	14	ORNATE LEFT PARENTHESIS	0xfd3e
+&	170517	AMPERSAND	0x26
+།	106330	TIBETAN MARK SHAD	0xf0d
+።	90203	ETHIOPIC FULL STOP	0x1362
+፥	60484	ETHIOPIC COLON	0x1365
+༌	60464	TIBETAN MARK DELIMITER TSHEG BSTAR	0xf0c
+။	51567	MYANMAR SIGN SECTION	0x104b
+/	46929	SOLIDUS	0x2f
+၊	38042	MYANMAR SIGN LITTLE SECTION	0x104a
+·	37985	MIDDLE DOT	0xb7
+‸	36310	CARET	0x2038
+*	34793	ASTERISK	0x2a
+۔	32432	ARABIC FULL STOP	0x6d4
+፤	31906	ETHIOPIC SEMICOLON	0x1364
+၏	21519	MYANMAR SYMBOL GENITIVE	0x104f
+។	20834	KHMER SIGN KHAN	0x17d4
+꓾	15773	LISU PUNCTUATION COMMA	0xa4fe
+᙮	13473	CANADIAN SYLLABICS FULL STOP	0x166e
+꤯	12892	KAYAH LI SIGN SHYA	0xa92f
+⵰	11478	TIFINAGH SEPARATOR MARK	0x2d70
+꓿	11118	LISU PUNCTUATION FULL STOP	0xa4ff
+॥	10763	DEVANAGARI DOUBLE DANDA	0x965
+؞	10403	ARABIC TRIPLE DOT PUNCTUATION MARK	0x61e
+၍	8936	MYANMAR SYMBOL COMPLETED	0x104d
+·	8431	GREEK ANO TELEIA	0x387
+†	7477	DAGGER	0x2020
+၌	6632	MYANMAR SYMBOL LOCATIVE	0x104c
+፣	5719	ETHIOPIC COMMA	0x1363
+៖	5528	KHMER SIGN CAMNUC PII KUUH	0x17d6
+꤮	4791	KAYAH LI SIGN CWI	0xa92e
+※	3439	REFERENCE MARK	0x203b
+፦	2727	ETHIOPIC PREFACE COLON	0x1366
+•	1749	BULLET	0x2022
+¶	1507	PILCROW SIGN	0xb6
+၎	1386	MYANMAR SYMBOL AFOREMENTIONED	0x104e
+﹖	1224	SMALL QUESTION MARK	0xfe56
+;	975	GREEK QUESTION MARK	0x37e
+…	827	HORIZONTAL ELLIPSIS	0x2026
+%	617	PERCENT SIGN	0x25
+・	468	KATAKANA MIDDLE DOT	0x30fb
+༎	306	TIBETAN MARK NYIS SHAD	0xf0e
+‡	140	DOUBLE DAGGER	0x2021
+#	137	NUMBER SIGN	0x23
+@	125	COMMERCIAL AT	0x40
+፡	121	ETHIOPIC WORDSPACE	0x1361
+៚	55	KHMER SIGN KOOMUUT	0x17da
+៕	49	KHMER SIGN BARIYOOSAN	0x17d5
+﹐	10	SMALL COMMA	0xfe50
+༅	6	TIBETAN MARK CLOSING YIG MGO SGAB MA	0xf05
+༄	6	TIBETAN MARK INITIAL YIG MGO MDUN MA	0xf04
+．	2	FULLWIDTH FULL STOP	0xff0e
+﹗	2	SMALL EXCLAMATION MARK	0xfe57
+﹕	2	SMALL COLON	0xfe55
+‰	2	PER MILLE SIGN	0x2030
+･	1	HALFWIDTH KATAKANA MIDDLE DOT	0xff65
+(	98504	LEFT PARENTHESIS	0x28
+[	27245	LEFT SQUARE BRACKET	0x5b
+⌊	1567	LEFT FLOOR	0x230a
+〔	95	LEFT TORTOISE SHELL BRACKET	0x3014
+【	36	LEFT BLACK LENTICULAR BRACKET	0x3010
+﴿	14	ORNATE RIGHT PARENTHESIS	0xfd3f
+_	4851	LOW LINE	0x5f
+$	72	DOLLAR SIGN	0x24
+€	14	EURO SIGN	0x20ac
+£	2	POUND SIGN	0xa3
+~	27462	TILDE	0x7e
+=	11450	EQUALS SIGN	0x3d
+|	8430	VERTICAL LINE	0x7c
+−	3971	MINUS SIGN	0x2212
+≫	1904	MUCH GREATER-THAN	0x226b
+≪	1903	MUCH LESS-THAN	0x226a
+	1450	PLUS SIGN	0x2b
+＜	345	FULLWIDTH LESS-THAN SIGN	0xff1c
+＞	344	FULLWIDTH GREATER-THAN SIGN	0xff1e
+¬	5	NOT SIGN	0xac
+×	4	MULTIPLICATION SIGN	0xd7
+→	2	RIGHTWARDS ARROW	0x2192
+᙭	537	CANADIAN SYLLABICS CHI SIGN	0x166d
+°	499	DEGREE SIGN	0xb0
+႟	421	MYANMAR SYMBOL SHAN EXCLAMATION	0x109f
+�	192	REPLACEMENT CHARACTER	0xfffd
+⌟	54	BOTTOM RIGHT CORNER	0x231f
+⌞	54	BOTTOM LEFT CORNER	0x231e
+©	2	COPYRIGHT SIGN	0xa9
+ 	40	NARROW NO-BREAK SPACE	0x202f
+ 	1	SIX-PER-EM SPACE	0x2006
+˜	40261	SMALL TILDE	0x2dc
+^	6469	CIRCUMFLEX ACCENT	0x5e
+¯	20	MACRON	0xaf
+ˇ	191442	CARON	0x2c7
+ⁿ	38144	SUPERSCRIPT LATIN SMALL LETTER N	0x207f
+ـ	9440	ARABIC TATWEEL	0x640
+ๆ	6766	THAI CHARACTER MAIYAMOK	0xe46
+ៗ	3310	KHMER SIGN LEK TOO	0x17d7
+々	678	IDEOGRAPHIC ITERATION MARK	0x3005
+ໆ	430	LAO KO LA	0xec6
+ー	319	KATAKANA-HIRAGANA PROLONGED SOUND MARK	0x30fc
+ⁱ	137	SUPERSCRIPT LATIN SMALL LETTER I	0x2071
+৷	11056	BENGALI CURRENCY NUMERATOR FOUR	0x9f7
+⅓	26	VULGAR FRACTION ONE THIRD	0x2153
+½	26	VULGAR FRACTION ONE HALF	0xbd
+¼	4	VULGAR FRACTION ONE QUARTER	0xbc
+⅟	1	FRACTION NUMERATOR ONE	0x215f
+⁄	57	FRACTION SLASH	0x2044
--- a/examples/mms/data_prep/text_normalization.py
+++ b/examples/mms/data_prep/text_normalization.py
+import json
+import re
+import unicodedata
+
+from examples.mms.data_prep.norm_config import norm_config
+
+
+def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
+
+    """Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
+
+    Args:
+        text : The string to be normalized
+        iso_code :
+        remove_numbers : Boolean flag to specify if words containing only digits should be removed
+
+    Returns:
+        normalized_text : the string after all normalization  
+
+    """
+
+    config = norm_config.get(iso_code, norm_config["*"])
+
+    for field in ["lower_case", "punc_set","del_set", "mapping", "digit_set", "unicode_norm"]:
+        if field not in config:
+            config[field] = norm_config["*"][field]
+
+
+    text = unicodedata.normalize(config["unicode_norm"], text)
+
+    # Convert to lower case
+
+    if config["lower_case"] and lower_case:
+        text = text.lower()
+
+    # brackets
+    
+    # always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
+    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
+    if remove_brackets:
+        text = re.sub(r"\([^\)]*\)", " ", text)
+
+    # Apply mappings
+
+    for old, new in config["mapping"].items():
+        text = re.sub(old, new, text)
+
+    # Replace punctutations with space
+
+    punct_pattern = r"[" + config["punc_set"]
+
+    punct_pattern += "]"
+
+    normalized_text = re.sub(punct_pattern, " ", text)
+
+    # remove characters in delete list
+
+    delete_patten = r"[" + config["del_set"] + "]"
+
+    normalized_text = re.sub(delete_patten, "", normalized_text)
+
+    # Remove words containing only digits
+    # We check for 3 cases  a)text starts with a number b) a number is present somewhere in the middle of the text c) the text ends with a number
+    # For each case we use lookaround regex pattern to see if the digit pattern in preceded and followed by whitespaces, only then we replace the numbers with space
+    # The lookaround enables overlapping pattern matches to be replaced
+
+    if remove_numbers:
+
+        digits_pattern = "[" + config["digit_set"]
+
+        digits_pattern += "]+"
+
+        complete_digit_pattern = (
+            r"^"
+            + digits_pattern
+            + "(?=\s)|(?<=\s)"
+            + digits_pattern
+            + "(?=\s)|(?<=\s)"
+            + digits_pattern
+            + "$"
+        )
+
+        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)
+
+    if config["rm_diacritics"]:
+        from unidecode import unidecode
+        normalized_text = unidecode(normalized_text)
+
+    # Remove extra spaces
+    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
+
+    return normalized_text