Commit 70a8a9e0 authored by wangwei990215's avatar wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
la una las 12
las dos la 1
las tres las 2
las cuatro las 3
las cinco las 4
las seis las 5
las siete las 6
las ocho las 7
las nueve las 8
las diez las 9
las once las 10
las doce las 11
las trece las 12
las catorce las 13
las quince las 14
las dieciséis las 15
las diecisiete las 16
las dieciocho las 17
las diecinueve las 18
las veinte las 19
las veintiuna las 20
las veintidos las 21
las veintitres las 22
las cero las 23
\ No newline at end of file
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_ALPHA,
DAMO_DIGIT,
DAMO_SIGMA,
DAMO_SPACE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying Spanish cardinals
        e.g. menos veintitrés -> cardinal { negative: "-" integer: "23"}

    Numbers are covered up to (but not including) "un cuatrillón",
    i.e. up to "one septillion" in English (10^{24}).

    ``self.graph`` refuses the cardinals listed in the digit and zero data
    files (so that "vivo en una casa" is not turned into "vivo en 1 casa");
    ``self.graph_no_exception`` carries no such restriction.

    "y" is ignored inside cardinal numbers, so a number is converted even when
    "y" appears in an ungrammatical place:
        e.g. "mil y una" -> cardinal { integer: "1001"}
        e.g. "ciento y una" -> cardinal { integer: "101"}

    Attributes set here for reuse by other grammars:
        graph_hundred_component_at_least_one_none_zero_digit,
        graph_no_exception, numbers_up_to_thousand, numbers_up_to_million,
        graph, fst
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))
        hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv"))

        # One three-digit group; every missing position is padded with "0".
        hundreds_place = hundreds | pynutil.insert("0")
        tens_and_units = pynini.union(
            twenties | teen | pynutil.insert("00"),
            (ties | pynutil.insert("0")) + delete_space + (digit | pynutil.insert("0")),
        )
        group = hundreds_place + delete_space + tens_and_units

        # Same group, restricted to outputs holding at least one non-zero digit.
        has_nonzero_digit = (
            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
        )
        nonzero_group = group @ has_nonzero_digit
        self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_group

        def mil_prefix():
            """Build "<group> mil" or a bare "mil" (read as 001)."""
            return pynini.union(
                nonzero_group + delete_space + pynutil.delete("mil"),
                # because we say 'mil', not 'un mil'
                pynutil.insert("001") + pynutil.delete("mil"),
            )

        def scaled(first_word, second_word):
            """Build "<group> <scale word>", accepting either spelling of the word."""
            return (
                nonzero_group
                + delete_space
                + (pynutil.delete(first_word) | pynutil.delete(second_word))
            )

        def thousands_of(scale_graph, plural_word):
            """Build "<mil prefix> <scale>", where the scale may be a bare plural word."""
            bare_plural = pynutil.insert("000") + pynutil.delete(plural_word)
            spoken = mil_prefix() + (delete_space + (scale_graph | bare_plural))
            # an absent block costs a small weight and yields six zeros
            return spoken | pynutil.insert("000000", weight=0.1)

        thousands = mil_prefix() | pynutil.insert("000", weight=0.1)

        millones = pynini.union(
            scaled("millones", "millón"),
            # to allow for 'mil millones'
            pynutil.insert("000") + pynutil.delete("millones"),
        )
        mil_millones = thousands_of(millones, "millones")

        # also allow 'millardo' instead of 'mil millones'
        millardo = scaled("millardo", "millardos")

        billones = scaled("billones", "billón")
        mil_billones = thousands_of(billones, "billones")

        trillones = scaled("trillones", "trillón")
        mil_trillones = thousands_of(trillones, "trillones")

        # Each block below contributes a fixed number of digits to the output.
        trillions_block = mil_trillones | (pynutil.insert("000", weight=0.1) + trillones)
        billions_block = mil_billones | (pynutil.insert("000", weight=0.1) + billones)
        millions_block = pynini.union(
            mil_millones,
            pynutil.insert("000", weight=0.1) + millones,
            millardo + millones,
            millardo + pynutil.insert("000", weight=0.1),
        )
        spoken_number = (
            trillions_block
            + delete_space
            + billions_block
            + delete_space
            + millions_block
            + delete_space
            + thousands
            + delete_space
            + group
        )
        graph = pynini.union(spoken_number, zero)

        # Drop the zero padding, keeping a lone "0" as it is.
        strip_leading_zeros = pynini.union(
            pynutil.delete(pynini.closure("0"))
            + pynini.difference(DAMO_DIGIT, "0")
            + pynini.closure(DAMO_DIGIT),
            "0",
        )
        graph = graph @ strip_leading_zeros

        # ignore "y" inside cardinal numbers
        drop_y = pynini.cdrewrite(pynutil.delete("y"), DAMO_SPACE, DAMO_SPACE, DAMO_SIGMA)
        graph = drop_y @ (DAMO_ALPHA + DAMO_SIGMA) @ graph
        self.graph_no_exception = graph

        def at_most_n_digits(count):
            """Accept any digit string whose length is between 1 and ``count``."""
            widths = DAMO_DIGIT
            for width in range(2, count + 1):
                widths = widths | (DAMO_DIGIT**width)
            return widths

        # saved for use in DecimalFst
        self.numbers_up_to_thousand = pynini.compose(graph, at_most_n_digits(3)).optimize()
        self.numbers_up_to_million = pynini.compose(graph, at_most_n_digits(6)).optimize()

        # don't convert cardinals from zero to nine inclusive
        small_numbers = pynini.project(pynini.union(digit, zero), "input")
        self.graph = (pynini.project(graph, "input") - small_numbers.arcsort()) @ graph

        optional_minus = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", '"-"') + DAMO_SPACE, 0, 1
        )
        tagged = optional_minus + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for classifying a "<day> de <month>" date,
        e.g. primero de enero -> date { day: "1" month: "enero" }
        e.g. uno de enero -> date { day: "1" month: "enero" }

    The day must evaluate to a number from 1 to 31; "primero" is also accepted
    for the first day of the month. The tagged output always ends with
    ``preserve_order: true``.
    """

    def __init__(self):
        super().__init__(name="date", kind="classify")

        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))

        # Spoken numbers below one hundred.
        below_hundred = pynini.union(
            digit,
            twenties,
            teen,
            (ties + pynutil.insert("0")),
            (ties + pynutil.delete(" y ") + digit),
        )

        # Keep only the values that are valid days of a month.
        valid_days = pynini.union(*map(str, range(1, 32)))
        day_number = below_hundred @ valid_days
        # can use "primero" for 1st day of the month
        day_number = pynini.union(day_number, pynini.cross("primero", "1"))

        day_field = pynutil.insert('day: "') + day_number + pynutil.insert('"')

        month_name = pynini.string_file(get_abs_path("data/months.tsv"))
        month_field = pynutil.insert('month: "') + month_name + pynutil.insert('"')

        day_de_month = (
            day_field + delete_space + pynutil.delete("de") + delete_extra_space + month_field
        )
        tagged = day_de_month + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_million: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Returns FST that tags a cardinal or a decimal followed by a magnitude word,
    e.g. dos millones -> integer_part: "2" quantity: "millones"

    The magnitude word is copied unchanged into the ``quantity`` field. In the
    cardinal form, leading zeros are removed from the integer and an all-zero
    value is rejected.

    Args:
        decimal: decimal FST
        cardinal_up_to_million: cardinal FST
    """
    # At least one non-zero digit is required; zeros in front of it are dropped.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(DAMO_DIGIT, "0")
        + pynini.closure(DAMO_DIGIT)
    )
    integer_value = cardinal_up_to_million @ without_leading_zeros

    magnitude_words = (
        "millón",
        "millones",
        "millardo",
        "millardos",
        "billón",
        "billones",
        "trillón",
        "trillones",
        "cuatrillón",
        "cuatrillones",
    )
    magnitude = pynini.union(*magnitude_words)

    open_quantity = pynutil.insert('quantity: "')
    close_quote = pynutil.insert('"')

    # <cardinal> <magnitude>
    cardinal_form = (
        pynutil.insert('integer_part: "')
        + integer_value
        + close_quote
        + delete_extra_space
        + open_quantity
        + magnitude
        + close_quote
    )
    # <decimal> <magnitude>
    decimal_form = decimal + delete_extra_space + open_quantity + magnitude + close_quote
    return cardinal_form | decimal_form
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
    Decimal point is either "." or ",", determined by whether "punto" or "coma" is spoken.
        e.g. menos uno coma dos seis -> decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" }
        e.g. menos uno punto dos seis -> decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" }

    This decimal rule assumes that decimals can be pronounced as:
    (a cardinal) + ('coma' or 'punto') plus (any sequence of cardinals <1000, including 'zero')

    Also writes large numbers in shortened form. The ``negative`` field is only
    emitted when "menos" is spoken, e.g.
        e.g. uno coma dos seis millón -> decimal { integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" }
        e.g. dos millones -> decimal { integer_part: "2" quantity: "millones" }
        e.g. mil ochocientos veinticuatro millones -> decimal { integer_part: "1824" quantity: "millones" }

    Attributes set here:
        graph: the fractional-part graph (sequence of cardinals < 1000)
        final_graph_wo_negative: decimal / quantity graph without the sign
        fst: the final tagged graph

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # number after decimal point can be any series of cardinals <1000, including 'zero'
        graph_decimal = cardinal.numbers_up_to_thousand
        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal

        # decimal point can be denoted by 'coma' or 'punto'
        decimal_point = pynini.cross("coma", 'morphosyntactic_features: ","')
        decimal_point |= pynini.cross("punto", 'morphosyntactic_features: "."')

        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", '"true"') + delete_extra_space,
            0,
            1,
        )

        graph_fractional = (
            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
        )

        cardinal_graph = cardinal.graph_no_exception | pynini.string_file(
            get_abs_path("data/numbers/zero.tsv")
        )
        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')

        # the integer part is optional, so a bare "coma cinco" is accepted as well
        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1)
            + decimal_point
            + delete_extra_space
            + graph_fractional
        )

        # Build the quantity graph once and share it between the unsigned graph
        # exposed to other grammars and the signed graph used for tagging.
        quantity_graph = get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million)

        self.final_graph_wo_negative = final_graph_wo_sign | quantity_graph

        final_graph = optional_graph_negative + final_graph_wo_sign
        final_graph |= optional_graph_negative + quantity_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import DAMO_ALPHA, GraphFst, insert_space
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying 'electronic' semiotic classes, i.e.
    email addresses (tagged with "username" and "domain" fields)
    and URLs (tagged with a single "protocol" field).
        e.g. c d f uno arroba a b c punto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
        e.g. doble ve doble ve doble ve a b c punto e d u -> tokens { electronic { protocol: "www.abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        # Exactly one space separates the spoken pieces and is removed.
        drop_space = pynutil.delete(" ")

        alnum = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )
        symbol = pynini.invert(pynini.string_file(get_abs_path("data/electronic/symbols.tsv")))
        username_piece = alnum | symbol
        dot = pynini.cross("punto", ".")

        # ---- e-mail: <username> arroba <server> punto <domain> ----
        # The username starts and ends with an alphanumeric piece.
        username_field = (
            pynutil.insert('username: "')
            + alnum
            + drop_space
            + pynini.closure(username_piece + drop_space)
            + alnum
            + pynutil.insert('"')
        )

        spelled_out = pynini.closure(alnum + drop_space) + alnum
        server_name = spelled_out | pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
        )
        top_level = spelled_out | pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        )
        domain_field = (
            pynutil.insert('domain: "')
            + server_name
            + drop_space
            + dot
            + drop_space
            + top_level
            + pynutil.insert('"')
        )

        email = (
            username_field
            + drop_space
            + pynutil.delete("arroba")
            + insert_space
            + drop_space
            + domain_field
        )

        # ---- url: [http(s)://] www . <name> <endings...> ----
        www = pynini.cross(pynini.union("www", "w w w", "doble ve doble ve doble ve"), "www")

        scheme = pynini.cross(pynini.union("http", "h t t p", "hache te te pe"), "http")
        scheme = scheme | pynini.cross(
            pynini.union("https", "h t t p s", "hache te te pe ese"), "https"
        )
        scheme = scheme + pynini.cross(" dos puntos barra barra ", "://")

        # e.g. .com, .es
        url_ending = (
            drop_space
            + symbol
            + drop_space
            + (top_level | pynini.closure(username_piece + drop_space) + username_piece)
        )

        url = (
            pynini.closure(scheme, 0, 1)
            + www
            + drop_space
            + dot
            + drop_space
            + (pynini.closure(drop_space + username_piece, 1) | server_name)
            + pynini.closure(url_ending, 1)
        )
        url_field = pynutil.insert('protocol: "') + url + pynutil.insert('"')

        tagged = email | url_field
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure
        e.g. menos doce kilogramos -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }

    A decimal amount, or any cardinal other than "un"/"una"/"uno", is followed
    by a plural unit; "un"/"una"/"uno" is followed by a singular unit. A unit
    may also be, or be followed by, "por <singular unit>", written as
    "/<unit>".

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="classify")

        number = cardinal.graph_no_exception

        # The data files are inverted so the graphs read spoken unit -> abbreviation.
        singular_to_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_singular.tsv"))
        )
        plural_to_abbr = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements_plural.tsv"))
        )

        optional_sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", '"true"') + delete_extra_space,
            0,
            1,
        )

        singular_unit = convert_space(singular_to_abbr)
        plural_unit = convert_space(plural_to_abbr)
        # "por <unit>" -> "/<unit>"
        per_unit = (
            pynutil.insert("/")
            + pynutil.delete("por")
            + delete_space
            + convert_space(singular_to_abbr)
        )

        def units_field(base_unit):
            """Wrap a unit, a "per" unit, or a unit followed by a "per" unit in units: "..."."""
            return (
                pynutil.insert('units: "')
                + (
                    base_unit
                    | per_unit
                    | pynutil.add_weight(base_unit + delete_space + per_unit, 0.01)
                )
                + pynutil.insert('"')
            )

        singular_units_field = units_field(singular_unit)
        plural_units_field = units_field(plural_unit)

        def cardinal_field(value):
            """Wrap an integer graph in cardinal { [negative] integer: "..." }."""
            return (
                pynutil.insert("cardinal { ")
                + optional_sign
                + pynutil.insert('integer: "')
                + value
                + pynutil.insert('"')
                + pynutil.insert(" }")
            )

        decimal_measure = (
            pynutil.insert("decimal { ")
            + optional_sign
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_units_field
        )

        # any number except one takes a plural unit
        not_one = (DAMO_SIGMA - "un" - "una" - "uno") @ number
        plural_measure = cardinal_field(not_one) + delete_extra_space + plural_units_field

        # "un" / "una" / "uno" takes a singular unit
        spoken_one = pynini.cross("un", "1") | pynini.cross("una", "1") | pynini.cross("uno", "1")
        singular_measure = cardinal_field(spoken_one) + delete_extra_space + singular_units_field

        cardinal_measure = plural_measure | singular_measure
        tagged = decimal_measure | cardinal_measure
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money
        e.g. doce dólares y cinco céntimos -> money { integer_part: "12" currency: "$" morphosyntactic_features: "," fractional_part: "05" }

    Cents are always written with a comma as the decimal separator and are
    padded to two digits. A bare cents amount is tagged with currency "$" and
    integer part "0".

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        number = cardinal.graph_no_exception
        decimal_amount = decimal.final_graph_wo_negative

        # The data files are inverted so the graphs read spoken currency -> symbol.
        singular_currency = pynini.invert(
            pynini.string_file(get_abs_path("data/currency_singular.tsv"))
        )
        plural_currency = pynini.invert(
            pynini.string_file(get_abs_path("data/currency_plural.tsv"))
        )
        singular_currency_field = (
            pynutil.insert('currency: "') + convert_space(singular_currency) + pynutil.insert('"')
        )
        plural_currency_field = (
            pynutil.insert('currency: "') + convert_space(plural_currency) + pynutil.insert('"')
        )

        # exactly two digits pass through, a single digit gets a "0" in front
        two_digits = (DAMO_DIGIT + DAMO_DIGIT) | (pynutil.insert("0") + DAMO_DIGIT)

        def optional_connector():
            """Build an optional "con" / "y" joining the main amount and the cents."""
            return pynini.closure(
                (pynutil.delete("con") | pynutil.delete("y")) + delete_space, 0, 1
            )

        # twelve dollars (and) fifty cents, zero cents
        plural_cents = (
            pynutil.add_weight(((DAMO_SIGMA - "un") @ number), -0.7) @ two_digits
            + delete_space
            + pynutil.delete(pynini.union("centavos", "céntimos"))
        )
        single_cent = (
            pynini.cross("un", "01")
            + delete_space
            + pynutil.delete(pynini.union("centavo", "céntimo"))
        )
        cents_with_unit = (
            pynutil.insert('morphosyntactic_features: ","')  # always use a comma in the decimal
            + insert_space
            + pynutil.insert('fractional_part: "')
            + pynini.union(plural_cents, single_cent)
            + pynutil.insert('"')
        )
        optional_cents_with_unit = pynini.closure(
            delete_space + optional_connector() + insert_space + cents_with_unit,
            0,
            1,
        )

        # twelve dollars fifty, only after integer
        # setenta y cinco dólares con sesenta y tres~$75,63
        optional_cents_without_unit = pynini.closure(
            delete_extra_space
            + pynutil.insert('morphosyntactic_features: ","')  # always use a comma in the decimal
            + insert_space
            + pynutil.insert('fractional_part: "')
            + optional_connector()
            + pynutil.add_weight(number @ two_digits, -0.7)
            + pynutil.insert('"'),
            0,
            1,
        )

        optional_cents = optional_cents_with_unit | optional_cents_without_unit

        def integer_amount(value, currency_field):
            """Build integer_part: "<value>" <currency> followed by the optional cents."""
            return (
                pynutil.insert('integer_part: "')
                + value
                + pynutil.insert('"')
                + delete_extra_space
                + currency_field
                + optional_cents
            )

        # any amount other than one takes the plural currency name
        not_one = (DAMO_SIGMA - "un" - "una") @ number
        integer_money = integer_amount(not_one, plural_currency_field)
        # "un" / "una" takes the singular currency name
        spoken_one = pynini.cross("un", "1") | pynini.cross("una", "1")
        integer_money = integer_money | integer_amount(spoken_one, singular_currency_field)

        decimal_money = decimal_amount + delete_extra_space + plural_currency_field
        # cents spoken on their own
        decimal_money = decimal_money | (
            pynutil.insert('currency: "$" integer_part: "0" ') + cents_with_unit
        )

        tagged = integer_money | decimal_money
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import DAMO_SIGMA, GraphFst, delete_space
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
        vigésimo primero -> ordinal { integer: "21" morphosyntactic_features: "o" }
    This class converts ordinals up to "milésimo" (one thousandth) exclusive.

    Ordinals listed in the ordinal digit data file are not converted (in order to avoid
    e.g. "primero hice ..." -> "1.º hice...", "segunda guerra mundial" -> "2.ª guerra mundial"
    and any other odd conversions.)

    This FST also records the ending of the ordinal (called "morphosyntactic_features"):
    either "o", "a", or "er".

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        cardinal_graph = cardinal.graph_no_exception
        graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))
        graph_twenties = pynini.string_file(get_abs_path("data/ordinals/twenties.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv"))
        graph_hundreds = pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv"))

        ordinal_graph_union = pynini.union(
            graph_digit,
            graph_teens,
            graph_twenties,
            graph_ties,
            graph_hundreds,
        )

        accept_o_endings = DAMO_SIGMA + pynini.accep("o")
        accept_a_endings = DAMO_SIGMA + pynini.accep("a")
        # pynini.closure() returns a new FST. The Fst.closure() method would
        # rewrite the shared, imported DAMO_SIGMA object in place.
        accept_er_endings = pynini.closure(DAMO_SIGMA) + pynini.accep("er")

        # 'optional_numbers_in_front' have negative weight so we always
        # include them if they're there.
        # pynini.closure() is used for the same reason as above: the imported
        # delete_space must not be modified.
        optional_numbers_in_front = pynini.closure(
            pynutil.add_weight(ordinal_graph_union, -0.1) + pynini.closure(delete_space)
        )

        # don't convert the single-word ordinals from the digit file
        graph_exception = pynini.project(pynini.union(graph_digit), "input").arcsort()

        def tagged_ordinal(accepted_endings, features):
            """Build the tagged graph for ordinals whose last word has the given ending."""
            last_word = accepted_endings @ ordinal_graph_union
            spoken = (optional_numbers_in_front + last_word) @ cardinal_graph
            # remove the excepted inputs before tagging
            spoken = (pynini.project(spoken, "input") - graph_exception) @ spoken
            return (
                pynutil.insert('integer: "')
                + spoken
                + pynutil.insert('"')
                + pynutil.insert(features)
            )

        graph = tagged_ordinal(accept_o_endings, ' morphosyntactic_features: "o"')
        graph |= tagged_ordinal(accept_a_endings, ' morphosyntactic_features: "a"')
        graph |= tagged_ordinal(accept_er_endings, ' morphosyntactic_features: "er"')

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
        e.g. a, -> tokens { name: "a" } tokens { name: "," }

    ``self.fst`` accepts exactly one punctuation character and emits it as
    ``name: "<character>"``.
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")

        # every character of this string is accepted as one punctuation mark
        marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        any_mark = pynini.union(*marks)

        tagged = pynutil.insert('name: "') + any_mark + pynutil.insert('"')
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers, e.g.
        uno dos tres uno dos tres cinco seis siete ocho -> { number_part: "123-123-5678" }.

    Grouping of the written number depends on how many digits are spoken:
        10 digits -> 3+3+4 (eg. 123-456-7890)
         9 digits -> 3+3+3 (eg. 123-456-789)
         8 digits -> 4+4   (eg. 1234-5678)

    In Spanish, digits are generally spoken individually, or as 2-digit numbers,
        eg. "one twenty three" = "123",
            "twelve thirty four" = "1234".

    (we ignore more complicated cases such as "three hundred and two" or "three nines").
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")

        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))

        # Building blocks, written in the digits -> words direction; the
        # complete pattern is inverted once at the end.
        one = pynini.invert(digit).optimize() | pynini.cross("0", "cero")
        two = pynini.union(
            twenties,
            teen,
            (ties + pynutil.insert("0")),
            (ties + delete_space + pynutil.delete("y") + delete_space + digit),
        ).invert()

        def ones(count):
            """Build exactly ``count`` single digits, each followed by a space."""
            return pynini.closure(one + insert_space, count, count)

        # the "-" of the written form has no spoken counterpart
        dash = pynutil.delete("-")

        # Each length is accepted either (1) spoken only with single digits,
        # or (2) spoken with double digits (and sometimes single digits).

        # 10 digits (1): all single digits
        ten_digits = ones(3) + dash + ones(3) + dash + ones(3) + one
        # 10 digits (2): (1+2) + (1+2) + (2+2) digits
        ten_digits = ten_digits | (
            one
            + insert_space
            + two
            + insert_space
            + dash
            + one
            + insert_space
            + two
            + insert_space
            + dash
            + two
            + insert_space
            + two
        )

        # 9 digits (1): all single digits
        nine_digits = ones(3) + dash + ones(3) + dash + ones(2) + one
        # 9 digits (2): (1+2) + (1+2) + (1+2) digits
        nine_digits = nine_digits | (
            one
            + insert_space
            + two
            + insert_space
            + dash
            + one
            + insert_space
            + two
            + insert_space
            + dash
            + one
            + insert_space
            + two
        )

        # 8 digits (1): all single digits
        eight_digits = ones(4) + dash + ones(3) + one
        # 8 digits (2): (2+2) + (2+2) digits
        eight_digits = eight_digits | (
            two + insert_space + two + insert_space + dash + two + insert_space + two
        )

        written_to_spoken = pynini.union(
            ten_digits,
            nine_digits,
            eight_digits,
        )
        # flip to words -> digits and wrap in the output field
        number_field = (
            pynutil.insert('number_part: "')
            + pynini.invert(written_to_spoken)
            + pynutil.insert('"')
        )

        self.fst = self.add_tokens(number_field).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
    Time formats that it converts:
    - <hour> + <minutes>
        e.g. la una diez -> time { hours: "la 1" minutes: "10" }
    - <hour> + " y " + <minutes>
        e.g. la una y diez -> time { hours: "la 1" minutes: "10" }
    - <hour> + " con " + <minutes>
        e.g. la una con diez -> time { hours: "la 1" minutes: "10" }
    - <hour> + " menos " + <minutes>
        e.g. las dos menos cuarto -> time { hours: "la 1" minutes: "45" }
      The hour is replaced by the previous one (via data/time/time_to.tsv) and
      the minutes become 60 minus the spoken amount; only "cinco", "diez",
      "cuarto", "veinte" and "veinticinco" are accepted after "menos".
    - "(un) cuarto para " + <hour>
        e.g. cuarto para las dos -> time { minutes: "45" hours: "la 1" }

    Note that times on the hour (e.g. "las dos" i.e. "two o'clock") do not get
    converted into a time format. This is to avoid converting phrases that are
    not part of a time phrase (e.g. "las dos personas" i.e. "the two people")
        e.g. las dos -> tokens { name: "las" } tokens { name: "dos" }
    However, if a time on the hour is followed by a suffix (indicating 'a.m.'
    or 'p.m.'), it will be converted.
        e.g. las dos pe eme -> time { hours: "las 2" minutes: "00" suffix: "p.m." }

    Note that although the TimeFst verbalizer can accept 'zone' (timezone) fields,
    so far the rules have not been added to the TimeFst tagger to process
    timezones (to keep the rules simple, and because timezones are not very
    often specified in Spanish.)
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
        time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))

        graph_1_to_100 = pynini.union(
            graph_digit,
            graph_twenties,
            graph_teen,
            (graph_ties + pynutil.insert("0")),
            (graph_ties + pynutil.delete(" y ") + graph_digit),
        )

        # note that graph_hour will start from 2 hours
        # "1 o'clock" will be treated differently because it
        # is singular
        digits_2_to_23 = [str(digits) for digits in range(2, 24)]
        digits_1_to_59 = [str(digits) for digits in range(1, 60)]

        graph_1oclock = pynini.cross("la una", "la 1")
        graph_hour = pynini.cross("las ", "las ") + (
            graph_1_to_100 @ pynini.union(*digits_2_to_23)
        )
        graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59)
        graph_minute_verbose = pynini.cross("media", "30") | pynini.cross("cuarto", "15")

        final_graph_hour = (
            pynutil.insert('hours: "') + (graph_1oclock | graph_hour) + pynutil.insert('"')
        )
        # an optional "y" / "con" in front of the minutes is dropped
        final_graph_minute = (
            pynutil.insert('minutes: "')
            + pynini.closure((pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0, 1)
            + (graph_minute | graph_minute_verbose)
            + pynutil.insert('"')
        )
        final_suffix = (
            pynutil.insert('suffix: "') + convert_space(suffix_graph) + pynutil.insert('"')
        )
        final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)

        # las nueve a eme (only convert on-the-hour times if they are followed by a suffix)
        graph_hsuffix = (
            final_graph_hour
            + delete_extra_space
            + pynutil.insert('minutes: "00"')
            + insert_space
            + final_suffix
        )

        # las nueve y veinticinco
        graph_hm = final_graph_hour + delete_extra_space + final_graph_minute

        # un cuarto para las cinco
        graph_mh = (
            pynutil.insert('minutes: "')
            + pynini.union(
                pynini.cross("un cuarto para", "45"),
                pynini.cross("cuarto para", "45"),
            )
            + pynutil.insert('"')
            + delete_extra_space
            + pynutil.insert('hours: "')
            + time_to_graph
            + pynutil.insert('"')
        )

        # las diez menos diez
        # the minutes written out are 60 minus the spoken amount
        graph_time_to = (
            pynutil.insert('hours: "')
            + time_to_graph
            + pynutil.insert('"')
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + delete_space
            + pynutil.delete("menos")
            + delete_space
            + pynini.union(
                pynini.cross("cinco", "55"),
                pynini.cross("diez", "50"),
                pynini.cross("cuarto", "45"),
                pynini.cross("veinte", "40"),
                pynini.cross("veinticinco", "35"),
            )
            + pynutil.insert('"')
        )

        final_graph = pynini.union(
            (graph_hm | graph_mh | graph_time_to) + final_suffix_optional, graph_hsuffix
        ).optimize()

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import os
import pynini
from fun_text_processing.inverse_text_normalization.es.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.es.taggers.date import DateFst
from fun_text_processing.inverse_text_normalization.es.taggers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.es.taggers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.es.taggers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.es.taggers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.es.taggers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.es.taggers.punctuation import PunctuationFst
from fun_text_processing.inverse_text_normalization.es.taggers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.es.taggers.time import TimeFst
from fun_text_processing.inverse_text_normalization.es.taggers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.es.taggers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from pynini.lib import pynutil
import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an
    entire sentence that is lower cased.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State
    Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        # The literal string "None" is treated the same as a missing cache dir.
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_es_itn.far")

        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse the previously compiled grammar instead of rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info("Creating ClassifyFst grammars.")

            # Cardinal is built first because the ordinal, decimal, measure and
            # money taggers receive it (or the decimal tagger) as an argument.
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst().fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst().fst

            # Union of all semiotic-class taggers. The whitelist gets the smallest
            # weight and the catch-all word graph the largest (100); presumably
            # the lowest-weight path wins downstream -- confirm against the decoder.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.09)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            # Punctuation is wrapped in its own `tokens { ... }` group.
            punct = (
                pynutil.insert("tokens { ")
                + pynutil.add_weight(punct_graph, weight=1.1)
                + pynutil.insert(" }")
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            # A token may be preceded and/or followed by any number of punctuation tokens.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + token
                + pynini.closure(pynutil.insert(" ") + punct)
            )

            # One or more tokens, joined through delete_extra_space, with
            # delete_space applied at both ends of the sentence.
            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                # Persist the freshly built grammar so later runs can restore it.
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
import pynini
from fun_text_processing.inverse_text_normalization.es.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
    """
    Tagger FST that classifies whitelisted tokens
        e.g. usted -> tokens { name: "ud." }

    This class has highest priority among all classifier grammars. The whitelist
    entries are loaded from "data/whitelist.tsv".
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        tsv_path = get_abs_path("data/whitelist.tsv")
        # The mapping read from the TSV is inverted, i.e. its input and output
        # sides are swapped before use.
        mapping = pynini.string_file(tsv_path).invert()

        # Wrap the rewritten token as: name: "<value>"
        tagged = pynutil.insert('name: "') + convert_space(mapping) + pynutil.insert('"')
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_SPACE, GraphFst
from pynini.lib import pynutil
class WordFst(GraphFst):
    """
    Tagger FST for plain tokens that do not belong to any special class.
    It can be considered the default class.
        e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")

        # One or more DAMO_NOT_SPACE symbols make up a word.
        non_space_run = pynini.closure(DAMO_NOT_SPACE, 1)
        opening = pynutil.insert('name: "')
        closing = pynutil.insert('"')
        self.fst = (opening + non_space_run + closing).optimize()
import os
def get_abs_path(rel_path):
    """
    Get absolute path

    Args:
        rel_path: path relative to the directory containing this file. It must
            be relative: os.path.join discards the base directory when given an
            absolute path.

    Returns absolute path
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    # Join with the platform's separator instead of concatenating a literal "/".
    return os.path.join(base_dir, rel_path)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Verbalizer FST for cardinals
        e.g. cardinal { negative: "-" integer: "23" } -> -23
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")

        # negative: "<one non-quote symbol>" -- field name and quotes are
        # removed, the symbol itself is kept.
        sign_field = (
            pynutil.delete("negative:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )
        # The sign field may appear at most once.
        sign_or_nothing = pynini.closure(sign_field, 0, 1)

        # integer: "<one or more non-quote symbols>" -- only the value is kept.
        unquoted_value = pynini.closure(DAMO_NOT_QUOTE, 1)
        integer_field = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + unquoted_value
            + pynutil.delete('"')
        )

        # Stored on the instance as the unsigned integer graph.
        self.numbers = integer_field

        signed_integer = sign_or_nothing + integer_field
        self.fst = self.delete_tokens(signed_integer).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Verbalizer FST for dates, e.g.
        date { day: "1" month: "enero" preserve_order: true } -> 1 de enero
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        def quoted_value(label):
            # <label> "<one or more non-quote symbols>" -- the label and the
            # quotes are deleted, only the value is kept.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        month = quoted_value("month:")
        day = quoted_value("day:")

        # day month, with the literal "de" inserted between them
        day_then_month = day + delete_extra_space + pynutil.insert("de") + insert_space + month

        # Trailing bookkeeping fields are deleted entirely.
        preserve_order_flag = (
            pynutil.delete("preserve_order:")
            + delete_space
            + pynutil.delete("true")
            + delete_space
        )
        field_order_flag = (
            pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )
        # closure() without bounds: either flag may occur any number of times,
        # including zero.
        trailing_flags = pynini.closure(preserve_order_flag | field_order_flag)

        date_graph = day_then_month + delete_space + trailing_flags
        self.fst = self.delete_tokens(date_graph).optimize()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment