detokenizer.py 2.03 KB
Newer Older
Jared Casper's avatar
Jared Casper committed
1
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
Mohammad's avatar
Mohammad committed
2
3
4

"""Detokenization."""

5
6
import re

Mohammad's avatar
Mohammad committed
7

8
def ptb_detokenizer(string):
    """Undo Penn Treebank style tokenization artifacts in ``string``.

    The replacements are applied in order:

    * a space before an apostrophe is removed (``dog 's`` -> ``dog's``),
    * spaces adjacent to newlines are removed,
    * the split negation ``" n't"`` is re-attached (``do n't`` -> ``don't``),
    * the number placeholder ``N`` (surrounded by spaces) becomes ``1``,
    * ``$`` / ``#`` are re-attached to a following ``1``.

    Args:
        string: Tokenized text.

    Returns:
        The detokenized text.
    """
    string = string.replace(" '", "'")
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" n't", "n't")
    # Keep the leading space so the number is not glued to the previous word
    # ("rose N points" -> "rose 1 points"); this matches wikitext_detokenizer.
    # The "$ 1" / "# 1" rules below then re-attach currency and hash signs.
    string = string.replace(" N ", " 1 ")
    string = string.replace("$ 1", "$1")
    string = string.replace("# 1", "#1")
    return string
17
18
19


def wikitext_detokenizer(string):
    """Undo WikiText style tokenization artifacts in ``string``.

    The rewrite steps run in a fixed order: contractions, ``@-@`` / ``@,@`` /
    ``@.@`` number separators, spaced punctuation, padding inside paired
    brackets and quotes, then miscellaneous fixes (spaced ``=`` heading
    markers, the degree sign, spaces around newlines, the ``N`` placeholder
    and the possessive ``'s``).

    Args:
        string: Tokenized text.

    Returns:
        The detokenized text.
    """
    # contractions
    string = string.replace("s '", "s'")
    # Re-attach an apostrophe to a following digit ("' 90s" -> "'90s").
    # The previous pattern carried literal "/" delimiters and a literal
    # "[0-9]" replacement, so it never did what was intended.
    string = re.sub(r"' ([0-9])", r"'\1", string)
    # number separators
    for separator in ("-", ",", "."):
        string = string.replace(" @" + separator + "@ ", separator)
    # punctuation: drop the space in front of the mark, keep the one after it
    for mark in (":", ";", ".", "!", "?", ","):
        string = string.replace(" " + mark + " ", mark + " ")
    # double brackets: strip whitespace just inside each matched pair
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    # Longest heading marker first so "= = = =" is not consumed piecewise.
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    degree_sign = chr(176)
    string = string.replace(" " + degree_sign + " ", degree_sign)
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")

    return string
51

Mohammad's avatar
Mohammad committed
52

53
def lambada_detokenizer(string):
    """Return ``string`` unchanged.

    No rewriting is applied for this dataset; the function exists so it can
    be registered in ``_DETOKENIZERS`` with the same call signature as the
    other detokenizers.

    Args:
        string: Input text.

    Returns:
        The same ``string`` object that was passed in.
    """
    return string
55
56


Mohammad's avatar
Mohammad committed
57
# Registry mapping a dataset-name fragment to its detokenizer function.
# get_detokenizer() walks these keys in insertion order and returns the
# function for the first key that occurs as a substring of the given path.
_DETOKENIZERS = {
    'ptb': ptb_detokenizer,
    'wiki': wikitext_detokenizer,
    'lambada': lambada_detokenizer,
}
Mohammad's avatar
Mohammad committed
62
63
64


def get_detokenizer(path):
    """Return the detokenizer whose registry key occurs in ``path``.

    Keys of ``_DETOKENIZERS`` are tested in insertion order with a plain,
    case-sensitive substring check, and the first hit wins.

    Args:
        path: String (typically a dataset path) to match against the keys.

    Returns:
        The matching detokenizer callable, or ``None`` when no key is a
        substring of ``path``.
    """
    for key, detokenizer in _DETOKENIZERS.items():
        if key in path:
            return detokenizer
    # No key matched: keep returning None (as before) rather than raising.
    return None