"src/targets/vscode:/vscode.git/clone" did not exist on "08360e83dbe62477a9d91f3cc9fe7ecd5a1e000d"
Commit 70a8a9e0 authored by wangwei990215's avatar wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
centésimo 1
centésima 1
ducentésimo 2
ducentésima 2
tricentésimo 3
tricentésima 3
trecentésimo 3
trecentésima 3
quadringentésimo 4
quadringentésima 4
quingentésimo 5
quingentésima 5
sexcentésimo 6
sexcentésima 6
seiscentésimo 6
seiscentésima 6
septingentésimo 7
septingentésima 7
setingentésimo 7
setingentésima 7
octingentésimo 8
octingentésima 8
octogentésimo 8
octogentésima 8
noningentésimo 9
noningentésima 9
nongentésimo 9
nongentésima 9
\ No newline at end of file
décimo 1
décima 1
vigésimo 2
vigésima 2
trigésimo 3
trigésima 3
quadragésimo 4
quadragésima 4
quinquagésimo 5
quinquagésima 5
sexagésimo 6
sexagésima 6
septuagésimo 7
septuagésima 7
setuagésimo 7
setuagésima 7
octogésimo 8
octogésima 8
nonagésimo 9
nonagésima 9
\ No newline at end of file
0 23
2 1
3 2
4 3
5 4
6 5
7 6
8 7
9 8
10 9
11 10
12 11
13 12
14 13
15 14
16 15
17 16
18 17
19 18
20 19
21 20
22 21
23 22
\ No newline at end of file
01 59
02 58
03 57
04 56
05 55
06 54
07 53
08 52
09 51
10 50
11 49
12 48
13 47
14 46
15 45
16 44
17 43
18 42
19 41
20 40
21 39
22 38
23 37
24 36
25 35
26 34
27 33
28 32
29 31
30 30
31 29
32 28
33 27
34 26
35 25
36 24
37 23
38 22
39 21
40 20
41 19
42 18
43 17
44 16
45 15
46 14
47 13
48 12
49 11
50 10
51 09
52 08
53 07
54 06
55 05
56 04
57 03
58 02
59 01
\ No newline at end of file
da madrugada da madrugada
da manhã da manhã
\ No newline at end of file
da tarde da tarde
da noite da noite
\ No newline at end of file
segunda-feira segunda feira
terça-feira terça feira
quarta-feira quarta feira
quinta-feira quinta feira
sexta-feira sexta feira
\ No newline at end of file
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_ALPHA,
DAMO_DIGIT,
DAMO_SIGMA,
DAMO_SPACE,
DAMO_WHITE_SPACE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals
        e.g. menos vinte e três -> cardinal { negative: "-" integer: "23"}

    The largest scale word handled is "sextilhão"/"sextilhões", each scale being
    preceded by at most one three-digit group, so the produced integers have at
    most 24 digits (i.e. they stay below 10^{24}).
    Inputs that are single entries of data/numbers/digit.tsv or
    data/numbers/zero.tsv are not converted by ``self.graph`` (in order to avoid
    "vivo em uma casa" --> "vivo em 1 casa" and any other odd conversions.)
    ``self.graph_no_exception`` keeps them.

    Although technically Portuguese grammar requires that "e" only comes after
    "10s" numbers (ie. "trinta", ..., "noventa"), by default these rules will
    convert numbers even with "e" in an ungrammatical place (because "e" is
    ignored inside cardinal numbers).
        e.g. "mil e uma" -> cardinal { integer: "1001"}
        e.g. "cento e uma" -> cardinal { integer: "101"}

    Attributes set by the constructor:
        graph_no_exception: words -> digits, without the small-number exception
        numbers_up_to_thousand: graph_no_exception restricted to 1-3 output digits
        numbers_up_to_million: graph_no_exception restricted to 1-6 output digits
        digits_from_year: numbers_up_to_million restricted to outputs "1".."2099"
        graph: graph_no_exception minus the digit.tsv / zero.tsv inputs
        fst: the final tagger
    """

    def __init__(self, use_strict_e: bool = False):
        """
        :param use_strict_e: When True forces to have the separator "e" in the right places
        """
        super().__init__(name="cardinal", kind="classify")
        # Word -> digit lookup tables (contents live in the data files; presumably
        # each maps a spoken number word to its digit string -- verify against data/).
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))
        graph_one_hundred = pynini.string_file(get_abs_path("data/numbers/onehundred.tsv"))
        graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv"))
        graph = None
        if not use_strict_e:
            # Lenient mode: every "e" standing between spaces is deleted up front (see
            # the cdrewrite at the end of this branch), so the sub-graphs below never
            # have to mention it.

            # One group below a thousand: a hundreds word (or an inserted "0") followed
            # by either a teen/twenties word (or an inserted "00"), or by a tens word
            # and a units word, each replaced by an inserted "0" when absent.
            graph_hundred_component = graph_hundreds | pynutil.insert("0")
            graph_hundred_component += delete_space
            graph_hundred_component += pynini.union(
                graph_twenties | graph_teen | pynutil.insert("00"),
                (graph_ties | pynutil.insert("0"))
                + delete_space
                + (graph_digit | pynutil.insert("0")),
            )
            graph_hundred_component = pynini.union(graph_hundred_component, graph_one_hundred)
            # Same group, but the output must contain at least one digit other than "0".
            graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
                pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
            )
            # Each scale below is either "<non-zero group> <scale word>" (the word is
            # deleted) or an inserted "000" with a small weight when the scale is absent.
            graph_thousands = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + pynutil.delete("mil"),
                pynutil.insert("001")
                + pynutil.delete("mil"),  # because we say 'mil', not 'hum mil'
                pynutil.insert("000", weight=0.01),
            )
            graph_milhoes = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + (pynutil.delete("milhão") | pynutil.delete("milhões")),
                pynutil.insert("000", weight=0.01),
            )
            graph_bilhoes = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + (pynutil.delete("bilhão") | pynutil.delete("bilhões")),
                pynutil.insert("000", weight=0.01),
            )
            graph_trilhoes = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + (pynutil.delete("trilhão") | pynutil.delete("trilhões")),
                pynutil.insert("000", weight=0.01),
            )
            graph_quatrilhoes = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + (pynutil.delete("quatrilhão") | pynutil.delete("quatrilhões")),
                pynutil.insert("000", weight=0.01),
            )
            graph_quintilhoes = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + (pynutil.delete("quintilhão") | pynutil.delete("quintilhões")),
                pynutil.insert("000", weight=0.01),
            )
            graph_sextilhoes = pynini.union(
                graph_hundred_component_at_least_one_none_zero_digit
                + delete_space
                + (pynutil.delete("sextilhão") | pynutil.delete("sextilhões")),
                pynutil.insert("000", weight=0.01),
            )
            # Scales are concatenated from largest to smallest; the alternative is
            # the zero.tsv graph on its own.
            graph = pynini.union(
                graph_sextilhoes
                + delete_space
                + graph_quintilhoes
                + delete_space
                + graph_quatrilhoes
                + delete_space
                + graph_trilhoes
                + delete_space
                + graph_bilhoes
                + delete_space
                + graph_milhoes
                + delete_space
                + graph_thousands
                + delete_space
                + graph_hundred_component,
                graph_zero,
            )
            # Strip leading zeros from the digit string; a lone "0" is kept as is.
            graph = graph @ pynini.union(
                pynutil.delete(pynini.closure("0"))
                + pynini.difference(DAMO_DIGIT, "0")
                + pynini.closure(DAMO_DIGIT),
                "0",
            )
            # Pre-process the input: delete every "e" that has DAMO_SPACE on both sides,
            # then require the result to start with DAMO_ALPHA before it is fed to the
            # number graph.
            graph = (
                pynini.cdrewrite(pynutil.delete("e"), DAMO_SPACE, DAMO_SPACE, DAMO_SIGMA)
                @ (DAMO_ALPHA + DAMO_SIGMA)
                @ graph
            )
        else:
            # Strict mode: "e" is consumed explicitly, only where the grammar below
            # allows it.

            # The separator " e ": one or more whitespace, "e", one or more whitespace.
            graph_e = (
                pynutil.delete(DAMO_WHITE_SPACE.plus)
                + pynutil.delete("e")
                + pynutil.delete(DAMO_WHITE_SPACE.plus)
            )
            # Two-digit part: teen/twenties word, tens word optionally followed by
            # "e <unit>", or (with a penalty) a bare unit padded with "0".
            # The composition rejects outputs that are all zeros.
            graph_ties_component = pynini.union(
                graph_teen | graph_twenties,
                graph_ties + ((graph_e + graph_digit) | pynutil.insert("0")),
                pynutil.add_weight(pynutil.insert("0") + graph_digit, 0.1),
            ) @ (pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT))
            # Hundreds words other than "cento".
            graph_hundreds_except_hundred = (
                pynini.project(graph_hundreds, "input") - "cento"
            ) @ graph_hundreds
            # Group that contains no internal hundreds-plus-"e" part: the onehundred.tsv
            # entry, a round hundreds word padded with "00", or "0" + two-digit part.
            # (Name suggests these are the groups that may be introduced by "e" -- see
            # how it is used after graph_e below.)
            graph_hundred_component_prefix_e = pynini.union(
                graph_one_hundred,
                pynutil.add_weight(graph_hundreds_except_hundred + pynutil.insert("00"), 0.1),
                pynutil.insert("0") + graph_ties_component,
            ) @ (pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT))
            graph_hundred_component_prefix_e = graph_hundred_component_prefix_e.optimize()
            # Group of the form "<hundreds> e <two-digit part>"; below it is only ever
            # attached with delete_space, never with graph_e.
            graph_hundred_component_no_prefix = pynini.union(
                graph_hundreds + graph_e + graph_ties_component,
            ) @ (pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT))
            graph_hundred_component_no_prefix = graph_hundred_component_no_prefix.optimize()
            # From here on every scale comes in two variants:
            #   *_prefix_e  -- its leading group is a "prefix_e" group (or bare "mil");
            #                  this variant is attached to a larger scale via graph_e
            #   *_no_prefix -- its leading group is a "no_prefix" group, or the scale is
            #                  absent ("000" inserted with weight 0.1); attached via
            #                  delete_space
            # and each is followed by the next smaller scale in either variant.
            graph_mil_prefix_e = pynini.union(
                # because we say 'mil', not 'hum mil'
                (
                    (graph_hundred_component_prefix_e + delete_space + pynutil.delete("mil"))
                    | (pynutil.insert("001", weight=0.1) + pynutil.delete("mil"))
                )
                + (
                    (graph_e + graph_hundred_component_prefix_e)
                    | (delete_space + graph_hundred_component_no_prefix)
                    | pynutil.insert("000", weight=0.1)
                )
            )
            graph_mil_no_prefix = pynini.union(
                (
                    (graph_hundred_component_no_prefix + delete_space + pynutil.delete("mil"))
                    | pynutil.insert("000", weight=0.1)
                )
                + (
                    (graph_e + graph_hundred_component_prefix_e)
                    | (delete_space + graph_hundred_component_no_prefix)
                    | pynutil.insert("000", weight=0.1)
                )
            )
            graph_milhao_prefix_e = pynini.union(
                (
                    graph_hundred_component_prefix_e
                    + delete_space
                    + (pynutil.delete("milhão") | pynutil.delete("milhões"))
                )
                + ((graph_e + graph_mil_prefix_e) | (delete_space + graph_mil_no_prefix))
            )
            graph_milhao_no_prefix = pynini.union(
                (
                    (
                        graph_hundred_component_no_prefix
                        + delete_space
                        + (pynutil.delete("milhão") | pynutil.delete("milhões"))
                    )
                    | pynutil.insert("000", weight=0.1)
                )
                + ((graph_e + graph_mil_prefix_e) | (delete_space + graph_mil_no_prefix))
            )
            graph_bilhao_prefix_e = pynini.union(
                (
                    graph_hundred_component_prefix_e
                    + delete_space
                    + (pynutil.delete("bilhão") | pynutil.delete("bilhões"))
                )
                + ((graph_e + graph_milhao_prefix_e) | (delete_space + graph_milhao_no_prefix))
            )
            graph_bilhao_no_prefix = pynini.union(
                (
                    (
                        graph_hundred_component_no_prefix
                        + delete_space
                        + (pynutil.delete("bilhão") | pynutil.delete("bilhões"))
                    )
                    | pynutil.insert("000", weight=0.1)
                )
                + ((graph_e + graph_milhao_prefix_e) | (delete_space + graph_milhao_no_prefix))
            )
            graph_trilhao_prefix_e = pynini.union(
                (
                    graph_hundred_component_prefix_e
                    + delete_space
                    + (pynutil.delete("trilhão") | pynutil.delete("trilhões"))
                )
                + ((graph_e + graph_bilhao_prefix_e) | (delete_space + graph_bilhao_no_prefix))
            )
            graph_trilhao_no_prefix = pynini.union(
                (
                    (
                        graph_hundred_component_no_prefix
                        + delete_space
                        + (pynutil.delete("trilhão") | pynutil.delete("trilhões"))
                    )
                    | pynutil.insert("000", weight=0.1)
                )
                + ((graph_e + graph_bilhao_prefix_e) | (delete_space + graph_bilhao_no_prefix))
            )
            graph_quatrilhao_prefix_e = pynini.union(
                (
                    graph_hundred_component_prefix_e
                    + delete_space
                    + (pynutil.delete("quatrilhão") | pynutil.delete("quatrilhões"))
                )
                + ((graph_e + graph_trilhao_prefix_e) | (delete_space + graph_trilhao_no_prefix))
            )
            graph_quatrilhao_no_prefix = pynini.union(
                (
                    (
                        graph_hundred_component_no_prefix
                        + delete_space
                        + (pynutil.delete("quatrilhão") | pynutil.delete("quatrilhões"))
                    )
                    | pynutil.insert("000", weight=0.1)
                )
                + ((graph_e + graph_trilhao_prefix_e) | (delete_space + graph_trilhao_no_prefix))
            )
            graph_quintilhao_prefix_e = pynini.union(
                (
                    graph_hundred_component_prefix_e
                    + delete_space
                    + (pynutil.delete("quintilhão") | pynutil.delete("quintilhões"))
                )
                + (
                    (graph_e + graph_quatrilhao_prefix_e)
                    | (delete_space + graph_quatrilhao_no_prefix)
                )
            )
            graph_quintilhao_no_prefix = pynini.union(
                (
                    (
                        graph_hundred_component_no_prefix
                        + delete_space
                        + (pynutil.delete("quintilhão") | pynutil.delete("quintilhões"))
                    )
                    | pynutil.insert("000", weight=0.1)
                )
                + (
                    (graph_e + graph_quatrilhao_prefix_e)
                    | (delete_space + graph_quatrilhao_no_prefix)
                )
            )
            graph_sextilhao_prefix_e = pynini.union(
                (
                    graph_hundred_component_prefix_e
                    + delete_space
                    + (pynutil.delete("sextilhão") | pynutil.delete("sextilhões"))
                )
                + (
                    (graph_e + graph_quintilhao_prefix_e)
                    | (delete_space + graph_quintilhao_no_prefix)
                )
            )
            graph_sextilhao_no_prefix = pynini.union(
                (
                    (
                        graph_hundred_component_no_prefix
                        + delete_space
                        + (pynutil.delete("sextilhão") | pynutil.delete("sextilhões"))
                    )
                    | pynutil.insert("000", weight=0.1)
                )
                + (
                    (graph_e + graph_quintilhao_prefix_e)
                    | (delete_space + graph_quintilhao_no_prefix)
                )
            )
            # A number may start at any scale. The sextilhão "no_prefix" variant allows
            # every scale to be absent, so it also covers numbers that start lower down
            # with a "no_prefix" group.
            graph = pynini.union(
                graph_sextilhao_no_prefix,
                graph_sextilhao_prefix_e,
                graph_quintilhao_prefix_e,
                graph_quatrilhao_prefix_e,
                graph_trilhao_prefix_e,
                graph_bilhao_prefix_e,
                graph_milhao_prefix_e,
                graph_mil_prefix_e,
                graph_hundred_component_prefix_e,
                graph_ties_component,
                graph_zero,
            ).optimize()
            # Strip leading zeros from the digit string; a lone "0" is kept as is.
            graph = graph @ pynini.union(
                pynutil.delete(pynini.closure("0"))
                + pynini.difference(DAMO_DIGIT, "0")
                + pynini.closure(DAMO_DIGIT),
                "0",
            )
        graph = graph.optimize()
        self.graph_no_exception = graph
        # save self.numbers_up_to_thousand for use in DecimalFst
        digits_up_to_thousand = DAMO_DIGIT | (DAMO_DIGIT**2) | (DAMO_DIGIT**3)
        numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand
        # save self.numbers_up_to_million for use in DecimalFst
        digits_up_to_million = (
            DAMO_DIGIT
            | (DAMO_DIGIT**2)
            | (DAMO_DIGIT**3)
            | (DAMO_DIGIT**4)
            | (DAMO_DIGIT**5)
            | (DAMO_DIGIT**6)
        )
        numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million
        # save self.digits_from_year for use in DateFst
        # (only outputs "1" .. "2099" are accepted as a year)
        digits_1_2099 = [str(digits) for digits in range(1, 2100)]
        digits_from_year = (numbers_up_to_million @ pynini.union(*digits_1_2099)).optimize()
        self.digits_from_year = digits_from_year
        # don't convert cardinals from zero to nine inclusive
        # (i.e. remove every input string of digit.tsv and zero.tsv from the domain)
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
        # Optional leading "menos ": emits `negative: "-"` and keeps the space.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", '"-"') + DAMO_SPACE, 0, 1
        )
        final_graph = (
            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for classifying dates.

    Two spoken layouts are accepted, each optionally followed by "de <year>":
        "<day> de <month name>"   e.g. primeiro de janeiro, um de janeiro
        "<day> do <month number>" e.g. vinte do oito
    The day is always emitted as two characters ("01".."31"); the numeric month
    layout emits "01".."12" and adds ``morphosyntactic_features: "/"``.
    ``preserve_order: true`` is appended to every match.

    Args:
        cardinal: CardinalFst; only its ``digits_from_year`` graph is used
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="date", kind="classify")

        def load(relative_path):
            """Compile one of the bundled lookup tables."""
            return pynini.string_file(get_abs_path(relative_path))

        def zero_padded(first, last):
            """Acceptor for the two-character strings of first..last (inclusive)."""
            return pynini.union(*[f"{value:02d}" for value in range(first, last + 1)])

        units = load("data/numbers/digit.tsv")
        tens = load("data/numbers/ties.tsv")
        teens = load("data/numbers/teen.tsv")
        twenties = load("data/numbers/twenties.tsv")

        # Any spoken number written with two output characters: a padded unit,
        # a twenties/teen word, a round tens word, or "<tens> e <unit>".
        two_digit_number = pynini.union(
            pynutil.insert("0") + units,
            twenties,
            teens,
            (tens + pynutil.insert("0")),
            (tens + pynutil.delete(" e ") + units),
        )

        # Day of month; "primeiro" is an accepted alternative for the 1st.
        spoken_day = pynini.union(
            two_digit_number @ zero_padded(1, 31),
            pynini.cross("primeiro", "01"),
        )
        day_field = pynutil.insert('day: "') + spoken_day + pynutil.insert('"')

        month_by_name = (
            pynutil.insert('month: "') + load("data/months.tsv") + pynutil.insert('"')
        )
        # vinte do oito -> 20/08
        month_by_number = (
            pynutil.insert('month: "')
            + (two_digit_number @ zero_padded(1, 12))
            + pynutil.insert('"')
        )

        named_layout = (
            day_field + delete_space + pynutil.delete("de") + delete_extra_space + month_by_name
        )
        numeric_layout = (
            day_field
            + delete_space
            + pynutil.delete("do")
            + delete_extra_space
            + month_by_number
            + pynutil.insert(' morphosyntactic_features: "/"')
        )
        day_and_month = named_layout | numeric_layout

        year_field = (
            delete_space
            + pynutil.delete("de")
            + delete_extra_space
            + pynutil.insert('year: "')
            + cardinal.digits_from_year
            + pynutil.insert('"')
        )

        tagged = day_and_month + year_field.ques + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_million: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Build the FST for "<number> <scale word>", where the number is either a
    cardinal or a decimal and the scale word is kept verbatim as ``quantity``.
        e.g. um milhão -> integer_part: "1" quantity: "milhão"
        e.g. <decimal> milhões -> <output of `decimal`> quantity: "milhões"

    Args:
        decimal: decimal FST; its output is kept unchanged in front of the quantity
        cardinal_up_to_million: cardinal FST; outputs consisting only of zeros are
            rejected and leading zeros are removed

    Returns:
        The union of the cardinal-based and the decimal-based variants.
    """
    scale_words = (
        "milhão",
        "milhões",
        "bilhão",
        "bilhões",
        "trilhão",
        "trilhões",
        "quatrilhão",
        "quatrilhões",
        "quintilhão",
        "quintilhões",
        "sextilhão",
        "sextilhões",
    )
    # The scale word is accepted as is and wrapped in a `quantity` field.
    quantity_field = (
        pynutil.insert('quantity: "') + pynini.union(*scale_words) + pynutil.insert('"')
    )

    # Require a non-zero leading digit after dropping any leading zeros.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(DAMO_DIGIT, "0")
        + pynini.closure(DAMO_DIGIT)
    )
    integer_field = (
        pynutil.insert('integer_part: "')
        + (cardinal_up_to_million @ without_leading_zeros)
        + pynutil.insert('"')
    )

    from_cardinal = integer_field + delete_extra_space + quantity_field
    from_decimal = decimal + delete_extra_space + quantity_field
    return from_cardinal | from_decimal
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
        Decimal point is either "." or ",", determined by whether "ponto" or "vírgula" is spoken.
            e.g. menos um vírgula dois seis -> decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" }
            e.g. menos um ponto dois seis -> decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" }

        This decimal rule assumes that decimals can be pronounced as:
        (a cardinal) + ('vírgula' or 'ponto') plus (any sequence of cardinals <1000, including 'zero')
        The integer part in front of the decimal point is optional.

        Also writes large numbers in shortened form, e.g.
            e.g. um vírgula dois seis milhões -> decimal { integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "milhões" }
            e.g. dois milhões -> decimal { integer_part: "2" quantity: "milhões" }
            e.g. mil oitocentos e vinte e quatro milhões -> decimal { integer_part: "1824" quantity: "milhões" }

        The ``negative`` field is only emitted (as "true") when "menos" is spoken.

    Attributes set by the constructor:
        graph: the fractional-part graph (sequence of cardinals below 1000)
        final_graph_wo_negative: unsigned decimal, with or without a quantity
        fst: the final tagger

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # number after decimal point can be any series of cardinals <1000, including 'zero'
        graph_decimal = cardinal.numbers_up_to_thousand
        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal

        # decimal point can be denoted by 'vírgula' or 'ponto'
        decimal_point = pynini.cross("vírgula", 'morphosyntactic_features: ","')
        decimal_point |= pynini.cross("ponto", 'morphosyntactic_features: "."')

        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", '"true"') + delete_extra_space,
            0,
            1,
        )

        graph_fractional = (
            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
        )

        cardinal_graph = cardinal.graph_no_exception | pynini.string_file(
            get_abs_path("data/numbers/zero.tsv")
        )
        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')

        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1)
            + decimal_point
            + delete_extra_space
            + graph_fractional
        )

        # "<cardinal or decimal> <scale word>". Built once and shared by both users
        # below; `|` and `+` return new FSTs, so sharing the operand is safe.
        graph_quantity = get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million)

        # Unsigned variant, exposed for taggers that handle the sign themselves.
        self.final_graph_wo_negative = final_graph_wo_sign | graph_quantity

        final_graph = optional_graph_negative + final_graph_wo_sign
        final_graph |= optional_graph_negative + graph_quantity

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import DAMO_ALPHA, GraphFst, insert_space
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying 'electronic' semiotic classes, i.e.
    email addresses (tagged with "username" and "domain" fields)
    and URLs (tagged with a single "protocol" field).
        e.g. c d f um arroba a b c ponto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
        e.g. dáblio dáblio dáblio ponto a b c ponto e d u -> tokens { electronic { protocol: "www.abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        def load(relative_path):
            """Compile one of the bundled lookup tables."""
            return pynini.string_file(get_abs_path(relative_path))

        # Exactly one space is removed between spelled-out characters.
        drop_space = pynutil.delete(" ")
        spoken_dot = pynini.cross("ponto", ".")

        # A single character: a letter as is, or a spoken digit / zero word.
        character = (
            DAMO_ALPHA
            | load("data/numbers/digit.tsv")
            | load("data/numbers/zero.tsv")
        )
        symbol = load("data/electronic/symbols.tsv").invert()
        username_character = character | symbol

        # Usernames start and end with a character; symbols may only appear inside.
        username_field = (
            pynutil.insert('username: "')
            + character
            + drop_space
            + pynini.closure(username_character + drop_space)
            + character
            + pynutil.insert('"')
        )

        # Spelled-out name, or one of the names known from the data files.
        spelled_out = pynini.closure(character + drop_space) + character
        server = spelled_out | load("data/electronic/server_name.tsv").invert()
        domain = spelled_out | load("data/electronic/domain.tsv").invert()

        domain_field = (
            pynutil.insert('domain: "')
            + server
            + drop_space
            + spoken_dot
            + drop_space
            + domain
            + pynutil.insert('"')
        )

        email = (
            username_field
            + drop_space
            + pynutil.delete("arroba")
            + insert_space
            + drop_space
            + domain_field
        )

        ############# url ###
        www = pynini.cross(pynini.union("www", "w w w", "dáblio dáblio dáblio"), "www")
        http = pynini.cross(pynini.union("http", "h t t p", "agá tê tê pê"), "http")
        https = pynini.cross(pynini.union("https", "h t t p s", "agá tê tê pê ésse"), "https")
        scheme = (http | https) + pynini.cross(" dois pontos barra barra ", "://")

        # e.g. .com, .es
        ending = (
            drop_space
            + symbol
            + drop_space
            + (domain | pynini.closure(username_character + drop_space) + username_character)
        )

        url = (
            pynini.closure(scheme, 0, 1)
            + www
            + drop_space
            + spoken_dot
            + drop_space
            + (pynini.closure(drop_space + username_character, 1) | server)
            + pynini.closure(ending, 1)
        )
        url_field = pynutil.insert('protocol: "') + url + pynutil.insert('"')
        ########

        self.fst = self.add_tokens(email | url_field).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure
        e.g. menos doze quilogramas -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }

    "um"/"uma" take a singular unit; every other cardinal and every decimal takes a
    plural unit. A unit may be followed (or replaced) by "por <singular unit>",
    which is written as "/<unit>".

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="classify")

        def load_units(relative_path):
            """Spoken unit -> written unit, from a table stored in the other direction."""
            table = pynini.string_file(get_abs_path(relative_path)).invert()
            return convert_space(table)

        singular = load_units("data/measurements_singular.tsv")
        plural = load_units("data/measurements_plural.tsv")

        # "por <unit>" -> "/<unit>"
        per_unit = pynutil.insert("/") + pynutil.delete("por") + delete_space + singular

        def units_field(base):
            """`units: "..."` for a plain unit, a bare rate, or unit followed by rate."""
            choices = (
                base
                | per_unit
                | pynutil.add_weight(base + delete_space + per_unit, 0.01)
            )
            return pynutil.insert('units: "') + choices + pynutil.insert('"')

        singular_field = units_field(singular)
        plural_field = units_field(plural)

        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", '"true"') + delete_extra_space,
            0,
            1,
        )

        def cardinal_measure(number, units):
            """`cardinal { [negative] integer: "N" } <units>`"""
            return (
                pynutil.insert("cardinal { ")
                + sign
                + pynutil.insert('integer: "')
                + number
                + pynutil.insert('"')
                + pynutil.insert(" }")
                + delete_extra_space
                + units
            )

        decimal_measure = (
            pynutil.insert("decimal { ")
            + sign
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_field
        )

        # Everything except the bare words "um"/"uma" goes with a plural unit ...
        many = (DAMO_SIGMA - "um" - "uma") @ cardinal.graph_no_exception
        # ... while "um"/"uma" goes with a singular one.
        one = pynini.cross("um", "1") | pynini.cross("uma", "1")

        cardinal_measures = cardinal_measure(many, plural_field) | cardinal_measure(
            one, singular_field
        )

        tagged = self.add_tokens(decimal_measure | cardinal_measures)
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money
        e.g. doze dólares e cinco centavos -> money { integer_part: "12" currency: "$" morphosyntactic_features: "," fractional_part: "05" }
        (the currency symbol is whatever the currency data files map the word to)

    Accepted layouts:
        <cardinal> <currency> [(com|e) <cents> centavo(s)]
        <cardinal> <currency> [(com|e)] <cents>
        <decimal> [de] <plural currency>
        <cents> centavo(s)                      (currency defaults to "R$")
        <cents> centavo(s) de <singular currency>

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        def currency_field(relative_path):
            """`currency: "..."` from a table stored as written -> spoken."""
            spoken_to_written = pynini.string_file(get_abs_path(relative_path)).invert()
            return (
                pynutil.insert('currency: "')
                + convert_space(spoken_to_written)
                + pynutil.insert('"')
            )

        singular_currency = currency_field("data/currency_singular.tsv")
        plural_currency = currency_field("data/currency_plural.tsv")

        any_amount = cardinal.graph_no_exception
        # Everything except the bare words "um"/"uma" (those take singular nouns).
        plural_amount = (DAMO_SIGMA - "um" - "uma") @ any_amount
        single_amount = pynini.cross("um", "1") | pynini.cross("uma", "1")

        # Cents are written with two digits: keep two, or pad a single one with "0".
        two_digits = (DAMO_DIGIT + DAMO_DIGIT) | (pynutil.insert("0") + DAMO_DIGIT)

        # Optional connector between the currency and the cents.
        optional_connector = pynini.closure(
            (pynutil.delete("com") | pynutil.delete("e")) + delete_space, 0, 1
        )

        # twelve dollars (and) fifty cents, zero cents
        cents_plural = (
            pynutil.add_weight(plural_amount, -0.7) @ two_digits
            + delete_space
            + pynutil.delete(pynini.union("centavos"))
        )
        cents_single = (
            pynini.cross("um", "01") + delete_space + pynutil.delete(pynini.union("centavo"))
        )
        cents_standalone = (
            pynutil.insert('morphosyntactic_features: ","')  # always use a comma in the decimal
            + insert_space
            + pynutil.insert('fractional_part: "')
            + pynini.union(cents_plural, cents_single)
            + pynutil.insert('"')
        )

        optional_cents_standalone = pynini.closure(
            delete_space + optional_connector + insert_space + cents_standalone,
            0,
            1,
        )

        # twelve dollars fifty, only after integer
        # setenta e cinco dólares com sessenta e três ~ $75,63
        optional_cents_suffix = pynini.closure(
            delete_extra_space
            + pynutil.insert('morphosyntactic_features: ","')  # always use a comma in the decimal
            + insert_space
            + pynutil.insert('fractional_part: "')
            + optional_connector
            + pynutil.add_weight(any_amount @ two_digits, -0.7)
            + pynutil.insert('"'),
            0,
            1,
        )

        optional_cents = optional_cents_standalone | optional_cents_suffix

        def whole_amount(number, currency):
            """`integer_part: "N" currency: "C"` optionally followed by cents."""
            return (
                pynutil.insert('integer_part: "')
                + number
                + pynutil.insert('"')
                + delete_extra_space
                + currency
                + optional_cents
            )

        graph_integer = whole_amount(plural_amount, plural_currency) | whole_amount(
            single_amount, singular_currency
        )

        # Cents with no integer amount: the integer part is "0"; the currency is
        # "R$" unless "de <currency>" follows (slightly preferred when present).
        cents_only = pynini.union(
            pynutil.insert('currency: "R$" integer_part: "0" ') + cents_standalone,
            pynutil.add_weight(
                pynutil.insert('integer_part: "0" ')
                + cents_standalone
                + delete_extra_space
                + pynutil.delete("de")
                + delete_space
                + singular_currency,
                -0.1,
            ),
        )

        decimal_amount = (
            decimal.final_graph_wo_negative
            + delete_extra_space
            + (pynutil.delete("de") + delete_space).ques
            + plural_currency
        )
        graph_decimal = decimal_amount | cents_only

        tagged = self.add_tokens(graph_integer | graph_decimal)
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import DAMO_SIGMA, GraphFst, delete_space
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
        vigésimo primeiro -> ordinal { integer: "21" morphosyntactic_features: "o" }
    This class converts ordinals built from the digit, ties and hundreds tables in
    data/ordinals/ (i.e. up to "milésimo" (one thousandth) exclusive).

    Ordinals that are a single entry of data/ordinals/digit.tsv are not converted
    (in order to avoid
        e.g. "primeiro fez ..." -> "1º fez...", "segunda guerra mundial" -> "2ª guerra mundial"
    and any other odd conversions.)

    This FST also records the ending of the ordinal (called "morphosyntactic_features"):
    either "o" or "a", taken from the last character of the spoken form.
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="classify")

        graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv"))
        graph_hundreds = pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv"))

        # One ordinal: a bare digit word, a tens word with optional digit word, or a
        # hundreds word with optional tens and digit words (missing parts become "0").
        ordinal_graph_union = pynini.union(
            pynutil.add_weight(graph_digit, 0.4),
            pynutil.add_weight(
                graph_ties + ((delete_space + graph_digit) | pynutil.insert("0")), 0.2
            ),
            graph_hundreds
            + ((delete_space + graph_ties) | pynutil.insert("0"))
            + ((delete_space + graph_digit) | pynutil.insert("0")),
        )

        # Restrict the spoken form by its final letter to get the gendered variants.
        accept_o_endings = DAMO_SIGMA + pynini.accep("o")
        accept_a_endings = DAMO_SIGMA + pynini.accep("a")

        ordinal_graph_o = accept_o_endings @ ordinal_graph_union
        ordinal_graph_a = accept_a_endings @ ordinal_graph_union

        # 'optional_numbers_in_front' have negative weight so we always
        # include them if they're there
        # NOTE: the constructive pynini.closure() is used on purpose. The
        # Fst.closure() method works in place and would modify `delete_space`,
        # which is a module-level FST shared with every other grammar.
        optional_in_front = pynini.closure(
            pynutil.add_weight(ordinal_graph_union, -0.1) + pynini.closure(delete_space)
        )
        graph_o_suffix = optional_in_front + ordinal_graph_o
        graph_a_suffix = optional_in_front + ordinal_graph_a

        # don't convert ordinals from one to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit), "input")
        graph_o_suffix = (
            pynini.project(graph_o_suffix, "input") - graph_exception.arcsort()
        ) @ graph_o_suffix
        graph_a_suffix = (
            pynini.project(graph_a_suffix, "input") - graph_exception.arcsort()
        ) @ graph_a_suffix

        graph = (
            pynutil.insert('integer: "')
            + graph_o_suffix
            + pynutil.insert('"')
            + pynutil.insert(' morphosyntactic_features: "o"')
        )
        graph |= (
            pynutil.insert('integer: "')
            + graph_a_suffix
            + pynutil.insert('"')
            + pynutil.insert(' morphosyntactic_features: "a"')
        )

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation: a single punctuation
    character is wrapped in a ``name`` field.
        e.g. , -> name: ","
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")

        # Every character of this string is accepted as one punctuation token.
        marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        any_mark = pynini.union(*marks)

        tagged = pynutil.insert('name: "') + any_mark + pynutil.insert('"')
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers, e.g.
        um dois um dois três quatro cinco seis sete oito nove -> { number_part: "(12) 12345-6789" }.
    If 11 digits are spoken, they are grouped as 2+5+4 (eg. (12) 34567-8901).
    If 10 digits are spoken, they are grouped as 2+4+4 (eg. (12) 3456-7890).
    If 9 digits are spoken, they are grouped as 5+4 (eg. 12345-6789).
    If 8 digits are spoken, they are grouped as 4+4 (eg. 1234-5678).
    In portuguese, digits are generally spoken individually, or as 2-digit numbers,
    eg. "trinta e quatro oitenta e dois" = "3482",
        "meia sete vinte" = "6720".
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")

        def load(relative_path):
            """Compile one of the bundled number tables."""
            return pynini.string_file(get_abs_path(relative_path))

        # Building blocks. They are defined spoken -> written and then inverted,
        # because the layouts below are described in the written -> spoken
        # direction and the whole thing is inverted back at the end.
        units = load("data/numbers/digit.tsv")
        tens = load("data/numbers/ties.tsv")
        twenties = load("data/numbers/twenties.tsv")
        teens = load("data/numbers/teen.tsv")
        zero = load("data/numbers/zero.tsv")
        half = pynini.cross("meia", "6")  # "meia" is spoken for 6

        any_digit = pynini.union(units, half, zero)
        single_digits = pynini.invert(any_digit).optimize()

        double_digits = (
            pynini.union(
                teens | twenties,
                (tens + pynutil.insert("0")),
                (tens + delete_space + pynutil.delete("e") + delete_space + units),
                (any_digit + delete_space + any_digit),
            )
            .invert()
            .optimize()
        )

        # Shared pieces of the written layouts (spaces are inserted on the spoken side,
        # the punctuation of the written form is deleted on the written side).
        four_digits = double_digits + insert_space + double_digits
        area_code = (
            pynutil.delete("(") + double_digits + insert_space + pynutil.delete(") ")
        )
        last_four = insert_space + pynutil.delete("-") + four_digits

        # 11 digits: (2) + (1+2+2) + (2+2)
        eleven_digit_graph = area_code + single_digits + insert_space + four_digits + last_four
        # 10 digits: (2) + (2+2) + (2+2)
        ten_digit_graph = area_code + four_digits + last_four
        # 9 digits: (1+2+2) + (2+2)
        nine_digit_graph = single_digits + insert_space + four_digits + last_four
        # 8 digits: (2+2) + (2+2)
        eight_digit_graph = four_digits + last_four

        written_to_spoken = pynini.union(
            eleven_digit_graph, ten_digit_graph, nine_digit_graph, eight_digit_graph
        )

        number_part = (
            pynutil.insert('number_part: "')
            + pynini.invert(written_to_spoken)
            + pynutil.insert('"')
        )

        self.fst = self.add_tokens(number_part).optimize()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment