Commit 70a8a9e0 authored by wangwei990215's avatar wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
ratus 1
seratus 1
dua ratus 2
tiga ratus 3
empat ratus 4
lima ratus 5
enam ratus 6
tujuh ratus 7
delapan ratus 8
sembilan ratus 9
\ No newline at end of file
sepuluh 10
sebelas 11
duabelas 12
dua belas 12
tigabelas 13
tiga belas 13
empatbelas 14
empat belas 14
limabelas 15
lima belas 15
enambelas 16
enam belas 16
tujuh belas 17
delapan belas 18
sembilan belas 19
\ No newline at end of file
ribu 1
seribu 1
dua ribu 2
tiga ribu 3
empat ribu 4
lima ribu 5
enam ribu 6
tujuh ribu 7
delapan ribu 8
sembilan ribu 9
\ No newline at end of file
ribu
seribu
juta
miliar
triliun
milion lipat empat
triliun
sextillion
septillion
oktillion
nonmiliar
satu juta
undecillion
duodecillion
triliun
quattuordecillion
quindecillion
sexdecillion
septendeciliun
octodecillion
novemdecillion
vigintillion
centillion
\ No newline at end of file
dua puluh 2
tigapuluh 3
empat puluh 4
empat puluh 4
lima puluh 5
enam puluh 6
tujuh puluh 7
delapan puluh 8
sembilan puluh 9
\ No newline at end of file
pertama satu
kedua dua
ketiga tiga
keempat empat
kelima lima
keenam enam
ketujuh tujuh
kedelapan delapan
kesembilan sembilan
\ No newline at end of file
deer
fish
sheep
foot feet
goose geese
man men
mouse mice
tooth teeth
woman women
won
child children
ox oxen
wife wives
wolf wolves
analysis analyses
criterion criteria
lbs
focus foci
percent
hertz
kroner krone
inch inches
calory calories
yen
megahertz
gigahertz
kilohertz
hertz
CC
c c
horsepower
hundredweight
kilogram force kilograms force
mega siemens
revolution per minute revolutions per minute
mile per hour miles per hour
megabit per second megabits per second
square foot square feet
kilobit per second kilobits per second
degree Celsius degrees Celsius
degree Fahrenheit degrees Fahrenheit
ATM
AU
BQ
CC
CD
DA
EB
EV
F
GB
G
GL
GPA
GY
HA
H
HL
GP
HS
KB
KL
KN
KT
KV
LM
MA
MA
MB
MC
MF
M
MM
MS
MV
MW
PB
PG
PS
S
TB
YB
ZB
\ No newline at end of file
1 59
2 58
3 57
4 56
5 55
6 54
7 53
8 52
9 51
10 50
11 49
12 48
13 47
14 46
15 45
16 44
17 43
18 42
19 41
20 40
21 39
22 38
23 37
24 36
25 35
26 34
27 33
28 32
29 31
30 30
31 29
32 28
33 27
34 26
35 25
36 24
37 23
38 22
39 21
40 20
41 19
42 18
43 17
44 16
45 15
46 14
47 13
48 12
49 11
50 10
51 9
52 8
53 7
54 6
55 5
56 4
57 3
58 2
59 1
p m p.m.
pm p.m.
p.m.
p.m p.m.
am a.m.
a.m.
a.m a.m.
a m a.m.
\ No newline at end of file
cst c s t
cet c e t
pst p s t
est e s t
pt p t
et e t
gmt g m t
satu 1
dua 2
tiga 3
empat 4
lima 5
enam 6
tujuh 7
delapan 8
sembilan 9
sepuluh 10
sebelas 11
dua belas 12
satu 13
dua 14
tiga 15
empat 16
lima 17
enam 18
tujuh 19
delapan 20
sembilan 21
sepuluh 22
sebelas 23
dua belas 24
\ No newline at end of file
e.g. misalnya
dr. dokter
mr. tuan
mrs. nyonya
st. santo
7-eleven tujuhsebelas
es3 e s tiga
s&p s dan p
ASAP a s a p
AT&T a t dan t
LLP l l p
ATM a t m
import os
import string
from pathlib import Path
from typing import Dict
import pynini
from fun_text_processing.inverse_text_normalization.id.utils import get_abs_path
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8
# ---------------------------------------------------------------------------
# Character-class acceptors shared by the grammars.
# ---------------------------------------------------------------------------
# Any single valid UTF-8 character.
DAMO_CHAR = utf8.VALID_UTF8_CHAR
# A single decimal digit.
DAMO_DIGIT = byte.DIGIT
# ASCII letters, split by case and combined.
DAMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
DAMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()
DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
# Every character of string.hexdigits (0-9, a-f, A-F).
DAMO_HEX = pynini.union(*string.hexdigits).optimize()
# Plain Python strings (not FSTs) for the two space characters.
DAMO_NON_BREAKING_SPACE = "\u00A0"
DAMO_SPACE = " "
# One whitespace character: space, tab, newline, carriage return or non-breaking space.
DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
# Any single character that is not one of the whitespace characters above.
DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
# Any single character other than a double quote.
DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
# Every character of string.punctuation, escaped so pynini treats each one literally.
DAMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
DAMO_GRAPH = pynini.union(DAMO_ALNUM, DAMO_PUNCT).optimize()
# Zero or more arbitrary characters (sigma-star).
DAMO_SIGMA = pynini.closure(DAMO_CHAR)

# ---------------------------------------------------------------------------
# Small reusable transducers for whitespace handling.
# ---------------------------------------------------------------------------
# Deletes zero or more whitespace characters.
delete_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE))
# Deletes at most one whitespace character.
delete_zero_or_one_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE, 0, 1))
# Inserts a single space on the output side.
insert_space = pynutil.insert(" ")
# Maps a run of one or more whitespace characters to exactly one space.
delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")
# Deletes any number of ' preserve_order: true' markers and the wrapper around
# ' field_order: "..."' markers.
# NOTE(review): DAMO_NOT_QUOTE matches exactly one character and is passed through
# (not deleted), so only single-character field_order values are matched here —
# confirm this is intended.
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(' field_order: "') + DAMO_NOT_QUOTE + pynutil.delete('"'))
)

# ---------------------------------------------------------------------------
# Singular <-> plural transducers (English-style suffix rules).
# ---------------------------------------------------------------------------
# Explicit singular/plural pairs loaded from data/suppletive.tsv at import time
# (presumably irregular forms — verify against the data file).
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
# Consonant letters; used to detect a "<consonant>y" ending.
_c = pynini.union(
"b",
"c",
"d",
"f",
"g",
"h",
"j",
"k",
"l",
"m",
"n",
"p",
"q",
"r",
"s",
"t",
"v",
"w",
"x",
"y",
"z",
)
# "<consonant>y" -> "<consonant>ies"
_ies = DAMO_SIGMA + _c + pynini.cross("y", "ies")
# words ending in s / sh / ch / x / z get "es" appended
_es = DAMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
# fallback: append "s"
_s = DAMO_SIGMA + pynutil.insert("s")
# Rules are combined with nested priority unions in the order
# suppletive, _ies, _es, _s (presumably earlier rules win where they apply —
# see pynini.examples.plurals._priority_union).
graph_plural = plurals._priority_union(
suppletive,
plurals._priority_union(_ies, plurals._priority_union(_es, _s, DAMO_SIGMA), DAMO_SIGMA),
DAMO_SIGMA,
).optimize()
SINGULAR_TO_PLURAL = graph_plural
# Inverse direction: plural input, singular output.
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)

# ---------------------------------------------------------------------------
# ASCII case mapping.
# ---------------------------------------------------------------------------
# Maps one uppercase ASCII letter to its lowercase counterpart.
TO_LOWER = pynini.union(
*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
)
# Inverse of TO_LOWER: one lowercase ASCII letter to uppercase.
TO_UPPER = pynini.invert(TO_LOWER)

# Small negative / positive weight constants; their use is outside this block
# (presumably for nudging path preference in grammars — verify against callers).
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
    """
    Write every grammar in ``graphs`` into a single FAR (finite state archive) file.

    Args:
        file_name: path of the archive to create
        graphs: maps each rule name to its Pynini WFST; every graph has
            ``optimize()`` called on it before it is stored under its rule name
    """
    far_writer = export.Exporter(file_name)
    for rule_name in graphs:
        far_writer[rule_name] = graphs[rule_name].optimize()
    far_writer.close()
    print(f"Created {file_name}")
def get_plurals(fst):
    """
    Compose the singular-to-plural transducer with the given fst.

    Args:
        fst: Fst placed on the right-hand side of the composition
    Returns:
        The composition ``SINGULAR_TO_PLURAL @ fst``
    """
    pluralized = SINGULAR_TO_PLURAL @ fst
    return pluralized
def get_singulars(fst):
    """
    Compose the plural-to-singular transducer with the given fst.

    Args:
        fst: Fst placed on the right-hand side of the composition
    Returns:
        The composition ``PLURAL_TO_SINGULAR @ fst``
    """
    singularized = PLURAL_TO_SINGULAR @ fst
    return singularized
def convert_space(fst) -> "pynini.FstLike":
    """
    Replace every regular space in the output of ``fst`` with a non-breaking space.

    Intended for tagger grammars that emit token values inside quotes,
    e.g. name: "hello kitty". The extra rewrite makes the transducer noticeably
    slower, so apply it only where spaces can actually occur within quotes.

    Args:
        fst: input fst
    Returns:
        ``fst`` composed with a rewrite rule that turns " " into "\u00A0" in any context
    """
    space_to_nbsp = pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE)
    # Empty left/right contexts: the rewrite applies at every position.
    rewrite_everywhere = pynini.cdrewrite(space_to_nbsp, "", "", DAMO_SIGMA)
    return fst @ rewrite_everywhere
class GraphFst:
    """
    Base class for all grammar fsts.

    On construction the class looks for a pre-compiled archive at
    ``<directory of this module>/grammars/<kind>/<name>.far``; when that file
    exists its FST is loaded into ``fst``, otherwise ``fst`` stays ``None``
    until a subclass assigns it.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # Build the path with pathlib rather than string concatenation: gluing
        # "/grammars/..." onto an empty dirname would silently point at the
        # filesystem root instead of a directory next to this module.
        self.far_path = Path(__file__).parent / "grammars" / kind / (name + ".far")
        if self.far_exist():
            self._fst = Far(
                self.far_path, mode="r", arc_type="standard", far_type="default"
            ).get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if the FAR file for this grammar exists on disk
        """
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        """The grammar's FST, or None when nothing was loaded or assigned yet."""
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps class name around to given fst, i.e. the output becomes
        ``<name> { <output of fst> }``

        Args:
            fst: input fst
        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Deletes class name wrap around output of given fst, then turns
        non-breaking spaces in the result back into regular spaces

        Args:
            fst: input fst
        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Reverse of convert_space: non-breaking space -> regular space, in any context.
        restore_spaces = pynini.cdrewrite(
            pynini.cross(DAMO_NON_BREAKING_SPACE, DAMO_SPACE), "", "", DAMO_SIGMA
        )
        return res @ restore_spaces
2022
300
9999
100001
100
1000
10289
1289
01 2345-6789
14
15
16
17
18
19
20
106
600
100
100
1 miliar
123
123
24 maret
10076
100076
10076 rupiah
76
+62 21 6539-0605
\ No newline at end of file
dua ribu dua puluh dua
tiga ribu
sembilan ribu sembilan ratus sembilan puluh sembilan
seribu satu
ribu
seribu
seribu dua ratus delapan puluh sembilan
ribu dua ratus delapan puluh sembilan
nol satu dua tiga empat lima enam tujuh delapan sembilan
empat belas
limabelas
enambelas
tujuh belas
delapan belas
sembilan belas
dua puluh
seratus enam
enam ratus
ratus
seratus
satu miliar
seratus dua puluh tiga
ratus dua puluh tiga
dua puluh empat maret
ribu tujuh puluh enam
seribu tujuh puluh enam
ribu tujuh puluh enam rupiah
tujuh puluh enam
ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment