Commit 70a8a9e0 authored by wangwei990215's avatar wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
1 59
2 58
3 57
4 56
5 55
6 54
7 53
8 52
9 51
10 50
11 49
12 48
13 47
14 46
15 45
16 44
17 43
18 42
19 41
20 40
21 39
22 38
23 37
24 36
25 35
26 34
27 33
28 32
29 31
30 30
31 29
32 28
33 27
34 26
35 25
36 24
37 23
38 22
39 21
40 20
41 19
42 18
43 17
44 16
45 15
46 14
47 13
48 12
49 11
50 10
51 9
52 8
53 7
54 6
55 5
56 4
57 3
58 2
59 1
일분 01
이분 02
삼분 03
사분 04
오분 05
육분 06
칠분 07
팔분 08
구분 09
십분 10
십일분 11
십이분 12
십삼분 13
십사분 14
십오분 15
십육분 16
십칠분 17
십팔분 18
십구분 19
이십분 20
이십일분 21
이십이분 22
이십삼분 23
이십사분 24
이십오분 25
이십육분 26
이십칠분 27
이십팔분 28
이십구분 29
삼십분 30
삼십일분 31
삼십이분 32
삼십삼분 33
삼십사분 34
삼십오분 35
삼십육분 36
삼십칠분 37
삼십팔분 38
삼십구분 39
사십분 40
사십일분 41
사십이분 42
사십삼분 43
사십사분 44
사십오분 45
사십육분 46
사십칠분 47
사십팔분 48
사십구분 49
오십분 50
오십일분 51
오십이분 52
오십삼분 53
오십사분 54
오십오분 55
오십육분 56
오십칠분 57
오십팔분 58
오십구분 59
일초 01
이초 02
삼초 03
사초 04
오초 05
육초 06
칠초 07
팔초 08
구초 09
십초 10
십일초 11
십이초 12
십삼초 13
십사초 14
십오초 15
십육초 16
십칠초 17
십팔초 18
십구초 19
이십초 20
이십일초 21
이십이초 22
이십삼초 23
이십사초 24
이십오초 25
이십육초 26
이십칠초 27
이십팔초 28
이십구초 29
삼십초 30
삼십일초 31
삼십이초 32
삼십삼초 33
삼십사초 34
삼십오초 35
삼십육초 36
삼십칠초 37
삼십팔초 38
삼십구초 39
사십초 40
사십일초 41
사십이초 42
사십삼초 43
사십사초 44
사십오초 45
사십육초 46
사십칠초 47
사십팔초 48
사십구초 49
오십초 50
오십일초 51
오십이초 52
오십삼초 53
오십사초 54
오십오초 55
오십육초 56
오십칠초 57
오십팔초 58
오십구초 59
p m p.m.
pm p.m.
p.m.
p.m p.m.
am a.m.
a.m.
a.m a.m.
a m a.m.
\ No newline at end of file
cst c s t
cet c e t
pst p s t
est e s t
pt p t
et e t
gmt g m t
one 12
two 1
three 2
four 3
five 4
six 5
seven 6
eight 7
nine 8
ten 9
eleven 10
twelve 11
\ No newline at end of file
e.g. for example
dr. doctor
mr. mister
mrs. misses
st. saint
7-eleven seven eleven
es3 e s three
s&p s and p
ASAP a s a p
AT&T a t and t
LLP l l p
ATM a t m
import os
import string
from pathlib import Path
from typing import Dict
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8
# --- Single-character classes ------------------------------------------------
DAMO_CHAR = utf8.VALID_UTF8_CHAR
DAMO_DIGIT = byte.DIGIT
DAMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
DAMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
DAMO_ALPHA = (DAMO_LOWER | DAMO_UPPER).optimize()
DAMO_ALNUM = (DAMO_DIGIT | DAMO_ALPHA).optimize()
DAMO_HEX = pynini.union(*string.hexdigits).optimize()

# --- Whitespace and other character sets ---------------------------------------
DAMO_NON_BREAKING_SPACE = "\u00A0"
DAMO_SPACE = " "
DAMO_WHITE_SPACE = pynini.union(
    DAMO_SPACE, "\t", "\n", "\r", DAMO_NON_BREAKING_SPACE
).optimize()
DAMO_NOT_SPACE = (DAMO_CHAR - DAMO_WHITE_SPACE).optimize()
DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, '"').optimize()
DAMO_PUNCT = pynini.union(*(pynini.escape(mark) for mark in string.punctuation)).optimize()
DAMO_GRAPH = (DAMO_ALNUM | DAMO_PUNCT).optimize()
# Any sequence (including the empty one) of valid UTF-8 characters.
DAMO_SIGMA = DAMO_CHAR.star

# --- Whitespace editing helpers ------------------------------------------------
# Deletes any amount of whitespace, including none.
delete_space = pynutil.delete(DAMO_WHITE_SPACE.star)
# Deletes at most one whitespace character.
delete_zero_or_one_space = pynutil.delete(DAMO_WHITE_SPACE.ques)
insert_space = pynutil.insert(" ")
# Collapses one or more whitespace characters into a single plain space.
delete_extra_space = pynini.cross(DAMO_WHITE_SPACE.plus, " ")
# Strips the ordering annotations that taggers append to a token.
delete_preserve_order = pynini.closure(
    pynini.union(
        pynutil.delete(" preserve_order: true"),
        pynutil.delete(' field_order: "') + DAMO_NOT_QUOTE + pynutil.delete('"'),
    )
)

# --- English-style pluralisation -----------------------------------------------
# Irregular singular/plural pairs come from a data file and take priority
# over the regular suffix rules below.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
_c = pynini.union(*"bcdfghjklmnpqrstvwxyz")
_ies = DAMO_SIGMA + _c + pynini.cross("y", "ies")
_es = DAMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = DAMO_SIGMA + pynutil.insert("s")
# Priority order: suppletive > "-ies" > "-es" > "-s".
graph_plural = plurals._priority_union(
    suppletive,
    plurals._priority_union(
        _ies,
        plurals._priority_union(_es, _s, DAMO_SIGMA),
        DAMO_SIGMA,
    ),
    DAMO_SIGMA,
).optimize()
SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)

# --- Case mapping --------------------------------------------------------------
TO_LOWER = pynini.union(
    *(pynini.cross(upper, upper.lower()) for upper in string.ascii_uppercase)
)
TO_UPPER = pynini.invert(TO_LOWER)

# --- Smallest weights used to bias path selection ------------------------------
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
    """
    Write a collection of named grammars into one OpenFst finite state archive (FAR).

    Args:
        file_name: path of the FAR file to create
        graphs: mapping from rule name to the Pynini WFST stored under that name
    """
    far_writer = export.Exporter(file_name)
    for rule_name, rule_fst in graphs.items():
        # Each grammar is optimized before it is written to the archive.
        far_writer[rule_name] = rule_fst.optimize()
    far_writer.close()
    print(f"Created {file_name}")
def get_plurals(fst):
    """
    Compose the singular-to-plural mapping with the given fst.

    Args:
        fst: Fst

    Returns:
        SINGULAR_TO_PLURAL composed on the left of ``fst``
    """
    return pynini.compose(SINGULAR_TO_PLURAL, fst)
def get_singulars(fst):
    """
    Compose the plural-to-singular mapping with the given fst.

    Args:
        fst: Fst

    Returns:
        PLURAL_TO_SINGULAR composed on the left of ``fst``
    """
    return pynini.compose(PLURAL_TO_SINGULAR, fst)
def convert_space(fst) -> "pynini.FstLike":
    """
    Replace every plain space in the output of ``fst`` with a non-breaking space.

    Intended for tagger grammars whose quoted token values may contain spaces,
    e.g. name: "hello kitty". The extra rewrite makes the transducer
    significantly slower, so apply it only where spaces inside quotes are possible.

    Args:
        fst: input fst

    Returns:
        fst whose output has breaking spaces converted to non-breaking spaces
    """
    space_to_nbsp = pynini.cdrewrite(
        pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
    )
    return fst @ space_to_nbsp
class GraphFst:
    """
    Base class for all grammar fsts.

    On construction the class looks for a pre-compiled grammar at
    ``<directory of this module>/grammars/<kind>/<name>.far``. If that file
    exists it is loaded into ``fst``; otherwise ``fst`` stays ``None`` until
    a subclass assigns it.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based
            normalization). The base class only stores this flag.
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # Build the path with pathlib, anchored on this module's directory.
        # String concatenation on os.path.dirname(__file__) produced the
        # root-anchored "/grammars/..." whenever __file__ had no directory part.
        self.far_path = Path(__file__).parent / "grammars" / kind / f"{name}.far"
        if self.far_exist():
            # Pass a plain string so the reader does not depend on
            # path-like object support.
            self._fst = Far(
                str(self.far_path), mode="r", arc_type="standard", far_type="default"
            ).get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if the FAR file for this grammar exists on disk
        """
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        """
        The grammar held by this object, or None if none was loaded or assigned
        """
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps the output of the given fst as ``<name> { ... }``

        Args:
            fst: input fst

        Returns:
            Fst: fst with the class-name wrapper inserted around its output
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Deletes the ``<name> { ... }`` wrapper around the input of the given fst

        Whitespace around the braces is deleted as well, and non-breaking
        spaces in the result are turned back into plain spaces.

        Args:
            fst: input fst

        Returns:
            Fst: fst that consumes the wrapper and applies ``fst`` to its content
        """
        unwrapped = (
            pynutil.delete(self.name)
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        return unwrapped @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_ALPHA,
DAMO_SIGMA,
DAMO_DIGIT,
DAMO_SPACE,
DAMO_CHAR,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying Korean cardinals.

    Spoken number words are converted to digit strings and tagged as
        cardinal { integer: "<digits>" }
    An optional leading "마이너스" followed by a space is tagged first, as
        cardinal { negative: "-" integer: "<digits>" }

    Exposed attributes:
        graph_no_exception: number graph without the exception filter
        graph: number graph with the (placeholder) exception list removed
        graph_hundred_component_at_least_one_none_zero_digit: hundreds graph
            restricted to outputs containing at least one non-zero digit
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        def load(relative_path):
            # Compile one lexicon file from the package data directory.
            return pynini.string_file(get_abs_path(relative_path))

        zero = load("data/numbers/zero.tsv")
        digit = load("data/numbers/digit.tsv")
        teens_without_zero = load("data/numbers/digit_teens_without_zero.tsv")
        teens_exact = load("data/numbers/digit_teens.tsv")
        # "inherent" lexicons: presumably the native-Korean numerals -- confirm
        # against the data files.
        inh_digit = load("data/numbers/digit_inherent_digit.tsv")
        inh_teens_without_zero = load("data/numbers/digit_inherent_teens_without_zero.tsv")
        inh_teens_exact = load("data/numbers/digit_inherent_teens.tsv")
        inh_others = load("data/numbers/digit_inherent_others.tsv")

        # Two-position numbers: a tens word followed by a unit word, or a
        # single lexicon entry.
        below_hundred_inh = pynini.union(
            inh_teens_exact, inh_teens_without_zero + inh_digit, inh_others
        )
        below_hundred = pynini.union(teens_without_zero + digit, teens_exact)

        pad_zero = pynutil.insert("0")
        digits = digit | zero
        teens = below_hundred | below_hundred_inh

        # 백 = hundred. The remainder is two positions; padded variants carry a
        # higher weight so that spoken digits are preferred over inserted zeros.
        hundred = (
            digit
            + pynutil.delete("백")
            + pynini.union(
                teens,
                pynutil.add_weight(zero + digit, 0.1),
                pynutil.add_weight(digit + pad_zero, 0.5),
                pynutil.add_weight(pad_zero**2, 1.0),
            )
        )

        non_zero_somewhere = (
            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
        )
        self.graph_hundred_component_at_least_one_none_zero_digit = hundred @ non_zero_somewhere

        # 천 = thousand. The remainder is three positions.
        thousand = (
            (hundred | teens | digits)
            + pynutil.delete("천")
            + pynini.union(
                hundred,
                pynutil.add_weight(zero + teens, 0.1),
                pynutil.add_weight(pad_zero + zero + digit, 0.5),
                pynutil.add_weight(digit + pad_zero**2, 0.8),
                pynutil.add_weight(pad_zero**3, 1.0),
            )
        )

        # 만 = ten thousand. The remainder is four positions; one space after
        # the unit word is dropped if present.
        ten_thousand = (
            (thousand | hundred | teens | digits)
            + pynutil.delete("만")
            + pynini.cross(" ", "").ques
            + pynini.union(
                thousand,
                pynutil.add_weight(zero + hundred, 0.1),
                pynutil.add_weight(pad_zero + zero + teens, 0.5),
                pynutil.add_weight(pad_zero + pad_zero + zero + digit, 0.5),
                pynutil.add_weight(digit + pad_zero**3, 0.8),
                pynutil.add_weight(pad_zero**4, 1.0),
            )
        )

        # 조 = trillion, 억 = hundred million. Both groups are optional, and the
        # unit words are accepted (kept in the output), not deleted.
        group = digits | teens | hundred | thousand | ten_thousand
        trillion_part = (group + pynini.accep("조").ques + pynini.cross(" ", "").ques).ques
        hundred_million_part = (
            group + pynini.accep("억").ques + pynini.cross(" ", "").ques
        ).ques
        number = trillion_part + hundred_million_part + group

        graph = pynini.union(
            number,
            below_hundred_inh,
            inh_digit,
            inh_teens_exact,
            inh_others,
        )

        # Placeholder exception list: it only excludes a dummy string.
        labels_exception = ["zzzzzzzzz"]
        graph_exception = pynini.union(*labels_exception)

        self.graph_no_exception = graph
        accepted_inputs = pynini.project(graph, "input") - graph_exception.arcsort()
        self.graph = accepted_inputs @ graph

        optional_minus_graph = (
            pynutil.insert("negative: ") + pynini.cross("마이너스", '"-"') + DAMO_SPACE
        ).ques

        tagged = (
            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        )
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_ALPHA,
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
# Number lexicons compiled once at import time; _get_year_graph() below reads
# these module-level graphs when it builds the year graph.
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize()
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
graph_digit_inh = pynini.string_file(
    get_abs_path("data/numbers/digit_inherent_digit.tsv")
).optimize()
def _get_month_graph():
    """
    Transducer for months, compiled from the data/months.tsv lexicon
    """
    return pynini.string_file(get_abs_path("data/months.tsv"))
def _get_day_graph():
    """
    Transducer for days of the month: the union of the data/day.tsv and
    data/day_inherent.tsv lexicons
    """
    numeric_days = pynini.string_file(get_abs_path("data/day.tsv"))
    inherent_days = pynini.string_file(get_abs_path("data/day_inherent.tsv"))
    return pynini.union(numeric_days, inherent_days)
def _get_year_graph():
    """
    Transducer for years read digit by digit.

    Accepts either four positions (a non-zero digit followed by three
    positions that may each be zero) or two non-zero digits.
    """
    non_zero = graph_digit | graph_digit_inh
    any_position = non_zero | graph_zero
    four_positions = non_zero + any_position**3
    two_positions = non_zero**2
    return pynini.union(four_positions, two_positions)
class DateFst(GraphFst):
    """
    Finite state transducer for classifying Korean dates.

    Recognises a year followed by 년, a month and a day, alone or combined as
    year-month-day, month-day or year-month. The result is tagged as
        date { year: "..." month: "..." day: "..." preserve_order: true }
    with only the fields that were present. 년 is kept in the year value, and
    월 / 일 are appended to the month / day values. Whitespace between parts
    is deleted and no separator is inserted between the fields.
    """

    def __init__(self):
        super().__init__(name="date", kind="classify")

        # The year carries a small positive weight.
        year_weight = 0.001
        year_words = _get_year_graph() + pynini.accep("년")
        year_field = (
            pynutil.insert('year: "')
            + pynutil.add_weight(year_words, year_weight)
            + pynutil.insert('"')
        )

        month_words = _get_month_graph() + pynini.cross("", "월")
        month_field = pynutil.insert('month: "') + month_words + pynutil.insert('"')

        day_words = _get_day_graph() + pynini.cross("", "일")
        day_field = pynutil.insert('day: "') + day_words + pynutil.insert('"')

        year_month_day = year_field + delete_space + month_field + delete_space + day_field
        month_day = month_field + delete_space + day_field
        year_month = year_field + delete_space + month_field

        any_date = pynini.union(
            year_month_day,
            month_day,
            year_month,
            year_field,
            month_field,
            day_field,
        )
        tagged = any_date + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Returns FST that tags a cardinal or a decimal followed by a quantity word:
        <cardinal> <quantity> -> integer_part: "<digits>" quantity: "<quantity>"
        <decimal> <quantity>  -> <decimal fields> quantity: "<quantity>"

    Quantity words are 만 (ten thousand), 백만 (million), 천만 (ten million),
    억 (hundred million) and 조 (trillion). After a decimal, 천 (thousand) is
    accepted as well.

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # Strip leading zeros and require a non-zero leading digit.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
    )
    integer_digits = pynini.compose(cardinal_up_to_hundred, without_leading_zeros)

    quantity_word = pynini.union("만", "백만", "천만", "억", "조")

    integer_with_quantity = (
        pynutil.insert('integer_part: "')
        + integer_digits
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + quantity_word
        + pynutil.insert('"')
    )
    decimal_with_quantity = (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + pynini.union(quantity_word, "천")
        + pynutil.insert('"')
    )
    return integer_with_quantity | decimal_with_quantity
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying Korean decimals.

    The integer part is a cardinal followed by 점 (point); the fractional part
    is read digit by digit after a single space:
        decimal { integer_part: "..." fractional_part: "..." }
    An optional leading "마이너스" adds negative: "true", and a trailing
    quantity word (see get_quantity) adds quantity: "...".

    Exposed attributes:
        graph: digit-by-digit fractional graph
        final_graph_wo_negative: untagged decimal graph without the sign

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # Fractional digits: any number of digit or zero words, possibly none.
        single_position = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        single_position |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        fractional_digits = pynini.closure(single_position)
        self.graph = fractional_digits

        # 마이너스 = minus
        optional_negative = (
            pynutil.insert("negative: ") + pynini.cross("마이너스", '"true"') + delete_extra_space
        ).ques

        fractional_field = (
            pynutil.insert('fractional_part: "') + fractional_digits + pynutil.insert('"')
        )
        # 점 = point; it is consumed as part of the integer field.
        integer_field = (
            pynutil.insert('integer_part: "')
            + cardinal.graph_no_exception
            + pynutil.delete("점")
            + pynutil.insert('"')
        )

        unsigned_decimal = integer_field + pynini.cross(" ", " ") + fractional_field
        with_quantity = get_quantity(
            unsigned_decimal, cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )

        self.final_graph_wo_negative = unsigned_decimal | with_quantity

        signed_decimal = optional_negative + delete_space + unsigned_decimal
        signed_quantity = optional_negative + with_quantity
        tagged = self.add_tokens(signed_decimal | signed_quantity)
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_ALPHA,
GraphFst,
insert_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic items: email addresses and URLs.

    Email: a spelled-out username, the word 에서, then a server, 점 (dot) and a domain:
        electronic { username: "..." domain: "..." }
    URL: optional "h t t p" / "h t t p s" and "www" prefixes followed by
    spelled-out parts joined by symbols:
        electronic { protocol: "..." }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        # Exactly one plain space between spelled-out characters is removed.
        drop_space = pynutil.delete(" ")

        alpha_num = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )
        # The symbols lexicon is inverted before use.
        symbols = pynini.invert(pynini.string_file(get_abs_path("data/electronic/symbols.tsv")))
        username_char = alpha_num | symbols
        # 점 = dot
        dot = pynini.cross("점", ".")

        # ---------------- email ----------------
        spelled_username = alpha_num + pynini.closure(drop_space + username_char)
        plain_word = pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
        username = (
            pynutil.insert('username: "') + (spelled_username | plain_word) + pynutil.insert('"')
        )

        spelled_alphanum = pynini.closure(alpha_num + drop_space) + alpha_num
        server = spelled_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")
        )
        domain = spelled_alphanum | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv")
        )
        domain_field = (
            pynutil.insert('domain: "')
            + server
            + drop_space
            + dot
            + drop_space
            + domain
            + pynutil.insert('"')
        )
        graph = (
            username
            + drop_space
            + pynutil.delete("에서")
            + insert_space
            + drop_space
            + domain_field
        )

        # ---------------- url ----------------
        www = pynini.cross(pynini.union("w w w", "www"), "www")
        scheme = pynini.union(
            pynini.cross("h t t p", "http"), pynini.cross("h t t p s", "https")
        ) + pynini.cross(" 콜론 슬래시 슬래시 ", "://")

        # A symbol followed by either a known domain or more spelled-out characters.
        ending = (
            drop_space
            + symbols
            + drop_space
            + (domain | pynini.closure(username_char + drop_space) + username_char)
        )
        url_body = (
            (
                (pynini.closure(drop_space + username_char, 1) | server)
                | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
            )
            + pynini.closure(ending, 1)
        ).optimize()

        protocol = (
            pynini.closure(scheme, 0, 1) + www + drop_space + dot + url_body
        ).optimize()
        protocol |= pynini.closure(www + drop_space + dot, 0, 1) + url_body
        protocol = pynutil.insert('protocol: "') + protocol.optimize() + pynutil.insert('"')

        graph |= protocol
        self.fst = self.add_tokens(graph).optimize()
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
convert_space,
delete_space,
delete_extra_space,
DAMO_SIGMA,
DAMO_CHAR,
DAMO_SPACE,
)
import pynini
from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying Korean fractions.

    The denominator is spoken first, followed by 분의, one space and the numerator:
        fraction { denominator: "..." numerator: "..." }
    An optional leading "마이너스" followed by a space adds negative: "true".

    Exposed attributes:
        graph_fraction_component: denominator + numerator graph
        final_graph_wo_negative: the same graph, without the sign

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="fraction", kind="classify")
        number = cardinal.graph_no_exception

        numerator_field = pynutil.insert('numerator: "') + number + pynutil.insert('"')
        # 분의 is consumed as part of the denominator.
        denominator_field = (
            pynutil.insert('denominator: "')
            + number
            + pynutil.delete("분의")
            + pynutil.insert('"')
        )

        fraction = denominator_field + pynini.cross(" ", " ") + numerator_field
        self.graph_fraction_component = fraction
        # optimize() works in place, so both attributes refer to the same
        # optimized graph.
        self.final_graph_wo_negative = fraction.optimize()

        # 마이너스 = minus
        optional_negative = (
            pynutil.insert("negative: ") + pynini.cross("마이너스", '"true"') + DAMO_SPACE
        ).ques

        signed_fraction = optional_negative + self.final_graph_wo_negative
        self.fst = self.add_tokens(signed_fraction).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
get_singulars,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying Korean measures.

    A cardinal or decimal followed by whitespace and a unit is tagged as
        measure { cardinal { integer: "..." } units: "..." }
        measure { decimal { integer_part: "..." fractional_part: "..." } units: "..." }
    An optional leading "마이너스" followed by whitespace adds negative: "true"
    inside the inner cardinal / decimal block.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="classify")
        cardinal_graph = cardinal.graph_no_exception
        decimal_graph = decimal.final_graph_wo_negative

        # The measurements lexicon is inverted before use.
        unit_graph = pynini.string_file(get_abs_path("data/measurements.tsv"))
        graph_unit = pynini.invert(unit_graph)

        # 마이너스 = minus. The marker used to be spelled "마이너"; because it
        # must be followed by whitespace, "마이너스 ..." could never match and
        # negative measures were not tagged. The spelling now matches the
        # cardinal, decimal and fraction taggers.
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("마이너스", '"true"') + delete_extra_space,
            0,
            1,
        )

        graph_units = pynutil.insert('units: "') + graph_unit + pynutil.insert('"')

        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_graph_negative
            + decimal_graph
            + pynutil.insert(" }")
            + delete_extra_space
            + graph_units
        )
        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert('integer: "')
            + cardinal_graph
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
            + graph_units
        )

        final_graph = subgraph_decimal | subgraph_cardinal
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_DIGIT,
DAMO_NOT_SPACE,
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
get_singulars,
insert_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money.

    A cardinal followed by whitespace and a currency word is tagged as
        money { integer_part: "..." currency: "..." }
    A decimal directly followed by a currency word is tagged as
        money { <decimal fields> currency: "..." }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")

        # The currency lexicon is inverted before use.
        currency = pynini.invert(pynini.string_file(get_abs_path("data/currency.tsv")))
        currency_field = pynutil.insert('currency: "') + currency + pynutil.insert('"')

        integer_amount = (
            pynutil.insert('integer_part: "')
            + cardinal.graph_no_exception
            + pynutil.insert('"')
            + delete_extra_space
            + currency_field
        )
        # No input whitespace is consumed here; a separating space is only
        # inserted into the output.
        decimal_amount = decimal.final_graph_wo_negative + insert_space + currency_field

        tagged = self.add_tokens(integer_amount | decimal_amount)
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst
from pynini.lib import pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation.

    Only the comma, the full stop and the question mark are supported. Each is
    emitted as name: "<mark>"; this grammar does not add a class-name wrapper.
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")
        supported_marks = pynini.union(",", ".", "?")
        tagged = pynutil.insert('name: "') + supported_marks + pynutil.insert('"')
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
DAMO_ALNUM,
DAMO_ALPHA,
DAMO_DIGIT,
GraphFst,
insert_space,
)
from pynini.lib import pynutil
def get_serial_number(cardinal):
    """
    Graph for a space-separated alphanumeric sequence of at least three
    characters that contains at least one digit. The separating spaces are
    removed from the output.

    Args:
        cardinal: CardinalFst
    """
    # Only cardinal readings that produce exactly one digit are used.
    single_digit = cardinal.graph_no_exception @ DAMO_DIGIT
    symbol = single_digit | DAMO_ALPHA
    spelled = symbol + pynini.closure(pynutil.delete(" ") + symbol, 2)
    has_a_digit = pynini.closure(DAMO_ALNUM) + DAMO_DIGIT + pynini.closure(DAMO_ALNUM)
    return pynini.compose(spelled, has_a_digit).optimize()
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone-like numbers.

    Three shapes are tagged:
        - 9 to 11 spoken digits, optionally preceded by a country code
          (optional 더한 for "+", one to three digits, then a space):
              telephone { country_code: "..." number_part: "..." }
        - dot-separated groups of digits (IP-address style):
              telephone { number_part: "..." }
        - alphanumeric serial numbers (see get_serial_number), with a small
          positive weight

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="telephone", kind="classify")

        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_dot = pynini.string_file(get_abs_path("data/numbers/dot.tsv"))
        any_digit = graph_digit | graph_zero

        def as_number_part(body):
            # Wrap a graph as the number_part field.
            return pynutil.insert('number_part: "') + body + pynutil.insert('"')

        # --- plain phone number: 9, 10 or 11 digits ---
        phone_digits = pynini.closure(any_digit, 9, 11)
        country_code = (
            pynutil.insert('country_code: "')
            + pynini.closure(pynini.cross("더한", "+"), 0, 1)
            + (pynini.closure(any_digit, 0, 2) + any_digit)
            + pynutil.insert('"')
        )
        optional_country_code = pynini.closure(
            country_code + pynutil.delete(" ") + insert_space, 0, 1
        ).optimize()
        phone_number = optional_country_code + as_number_part(phone_digits)

        # --- ip: the first group uses the digit lexicon without zero ---
        ip_digits = graph_digit.plus + (graph_dot + any_digit.plus).plus
        ip_number = as_number_part(ip_digits.optimize())

        # --- serial number ---
        serial_number = as_number_part(
            pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
        )

        tagged = self.add_tokens(pynini.union(phone_number, ip_number, serial_number))
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.ko.utils import get_abs_path, num_to_word
from fun_text_processing.inverse_text_normalization.ko.graph_utils import (
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time.

    Hours, minutes and seconds come from the data/time lexicons and are tagged as
        time { hours: "..." minutes: "..." seconds: "..." suffix: "..." zone: "..." }
    Accepted shapes:
        - hours + minutes, optionally followed by a suffix and a time zone
        - hours + minutes + seconds, optionally followed by a suffix and a time zone
        - hours + whitespace + optional minutes (minutes: "00" when absent),
          followed by a mandatory suffix and an optional time zone
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        def load(relative_path):
            # Compile one lexicon file from the package data directory.
            return pynini.string_file(get_abs_path(relative_path))

        suffix_graph = load("data/time/time_suffix.tsv")
        # The time-zone lexicon is inverted before use.
        time_zone_graph = pynini.invert(load("data/time/time_zone.tsv"))
        hour_graph = load("data/time/hours.tsv")
        minute_graph = load("data/time/minutes.tsv")
        second_graph = load("data/time/seconds.tsv")

        hours_field = pynutil.insert('hours: "') + hour_graph + pynutil.insert('"')

        # Suffix and zone: input whitespace is dropped and a single space is
        # inserted before the field.
        suffix_field = (
            delete_space
            + insert_space
            + pynutil.insert('suffix: "')
            + convert_space(suffix_graph)
            + pynutil.insert('"')
        )
        optional_suffix = pynini.closure(suffix_field, 0, 1)
        optional_zone = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert('zone: "')
            + convert_space(time_zone_graph)
            + pynutil.insert('"'),
            0,
            1,
        )

        hours_minutes = (
            hours_field
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + minute_graph
            + pynutil.insert('"')
        )
        hours_minutes_seconds = (
            hours_minutes
            + delete_extra_space
            + pynutil.insert('seconds: "')
            + second_graph
            + pynutil.insert('"')
        )
        # Hour with a mandatory suffix; missing minutes are filled in as "00".
        hours_with_suffix = (
            hours_field
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + (pynutil.insert("00") | minute_graph)
            + pynutil.insert('"')
            + suffix_field
            + optional_zone
        )

        final_graph = (hours_minutes | hours_minutes_seconds) + optional_suffix + optional_zone
        final_graph |= hours_with_suffix

        tagged = self.add_tokens(final_graph.optimize())
        self.fst = tagged.optimize()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment