Commit 70a8a9e0 authored by wangwei990215's avatar wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
cst c s t
cet c e t
pst p s t
est e s t
pt p t
et e t
gmt g m t
one 12
two 1
three 2
four 3
five 4
six 5
seven 6
eight 7
nine 8
ten 9
eleven 10
twelve 11
\ No newline at end of file
e.g. for example
dr. doctor
mr. mister
mrs. misses
st. saint
7-eleven seven eleven
es3 e s three
s&p s and p
ASAP a s a p
AT&T a t and t
LLP l l p
ATM a t m
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_ALPHA,
DAMO_DIGIT,
DAMO_SIGMA,
DAMO_SPACE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals, e.g.
        minus twenty three -> cardinal { negative: "-" integer: "23" }

    The spoken forms of zero through twelve are removed from the accepted
    input of ``self.graph`` (and therefore of ``self.fst``);
    ``self.graph_no_exception`` keeps them for use by other taggers.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # Hundreds position: "<digit> hundred", or an inserted "0" when it is not spoken.
        hundreds_digit = pynini.union(
            digit + delete_space + pynini.cross("hundred", ""), pynutil.insert("0")
        )
        # Last two positions: a teen, nothing at all ("00"), or optional ties
        # followed by an optional digit (each missing part is filled with "0").
        last_two_digits = pynini.union(
            teen | pynutil.insert("00"),
            (ties | pynutil.insert("0")) + delete_space + (digit | pynutil.insert("0")),
        )
        hundred_component = hundreds_digit + delete_space + last_two_digits

        # Same component, restricted to outputs containing at least one non-zero digit.
        nonzero_component = hundred_component @ (
            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
        )
        self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_component

        def magnitude_group(scale_word):
            # A non-zero group followed by its scale word, or (with weight 0.1)
            # an inserted "000" when that magnitude is absent from the input.
            return pynini.union(
                nonzero_component + delete_space + pynutil.delete(scale_word),
                pynutil.insert("000", weight=0.1),
            )

        # Largest magnitude first, so the groups concatenate in written order.
        scale_words = (
            "sextillion",
            "quintillion",
            "quadrillion",
            "trillion",
            "billion",
            "million",
            "thousand",
        )
        spoken_number = magnitude_group(scale_words[0])
        for scale_word in scale_words[1:]:
            spoken_number = spoken_number + delete_space + magnitude_group(scale_word)
        spoken_number = spoken_number + delete_space + hundred_component

        graph = pynini.union(spoken_number, zero)
        # Remove leading zeros from the digit string, but let a lone "0" through.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0"))
            + pynini.difference(DAMO_DIGIT, "0")
            + pynini.closure(DAMO_DIGIT),
            "0",
        )

        # Delete the word "and" when it sits between two spaces, then require
        # the result to match DAMO_ALPHA followed by DAMO_SIGMA before parsing.
        drop_and = pynini.cdrewrite(pynutil.delete("and"), DAMO_SPACE, DAMO_SPACE, DAMO_SIGMA)
        graph = drop_and @ (DAMO_ALPHA + DAMO_SIGMA) @ graph
        self.graph_no_exception = graph

        # Spoken forms of 0..12 are subtracted from the accepted inputs.
        excluded_inputs = pynini.union(*[num_to_word(n) for n in range(0, 13)])
        self.graph = (pynini.project(graph, "input") - excluded_inputs.arcsort()) @ graph

        optional_minus = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", '"-"') + DAMO_SPACE, 0, 1
        )
        tagged = (
            optional_minus + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        )
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_ALPHA,
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
# Number grammars compiled once at import time from the TSV files under
# data/numbers/ and reused by the helper functions in this module.
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()
def _get_month_graph():
    """
    Build the month transducer from "data/months.tsv", e.g. march -> march
    """
    return pynini.string_file(get_abs_path("data/months.tsv"))
def _get_ties_graph():
    """
    Transducer for 20-99: a ties word followed by an optional digit word, e.g.
        twenty three -> 23
    When no digit word follows, "0" is inserted in its place.
    """
    # Either "<space(s)><digit word>" or an inserted "0".
    units_position = (delete_space + graph_digit) | pynutil.insert("0")
    return ties_graph + units_position
def _get_range_graph():
    """
    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s)
    """
    tens = _get_ties_graph()
    leading_pair = tens | graph_teen

    # "<pair> hundreds" -> "<pair>00s"
    centuries = leading_pair + delete_space + pynini.cross("hundreds", "00s")
    # "two thousands" -> "2000s"
    millennia = pynini.cross("two", "2") + delete_space + pynini.cross("thousands", "000s")

    # Turn a plural word into its singular ("...ies" -> "...y", or drop a final
    # "s"), then read that singular as a ties number or as "ten".
    singularize = pynini.closure(DAMO_ALPHA, 1) + (
        pynini.cross("ies", "y") | pynutil.delete("s")
    )
    decade_value = singularize @ (tens | pynini.cross("ten", "10"))
    decades = leading_pair + delete_space + decade_value + pynutil.insert("s")

    graph = centuries | millennia | decades

    # Keep only outputs shaped like "1ddds" or "2ddds".
    allowed_shape = pynini.union("1", "2") + DAMO_DIGIT + DAMO_DIGIT + DAMO_DIGIT + "s"
    return graph @ allowed_shape
def _get_year_graph():
    """
    Transducer for year, e.g. twenty twenty -> 2020
    """
    tens = _get_ties_graph()

    # "oh <digit>" / "o <digit>": a spoken zero followed by a single digit word.
    spoken_zero = pynini.cross(pynini.accep("oh") | pynini.accep("o"), "0")
    zero_then_digit = spoken_zero + delete_space + graph_digit
    zero_then_digit.optimize()

    # "<digit> thousand [<digit> hundred] <teen | ties>"; a missing hundreds
    # part is filled with an inserted "0".
    optional_hundreds = (
        graph_digit + delete_space + pynutil.delete("hundred")
    ) | pynutil.insert("0")
    thousands_form = (
        graph_digit
        + delete_space
        + pynutil.delete("thousand")
        + delete_space
        + optional_hundreds
        + delete_space
        + (graph_teen | tens)
    )

    # Second half of a two-part year such as "twenty nineteen".
    second_half = tens | zero_then_digit | graph_teen

    # 20 19, 40 12, 2012 - assuming no limit on the year
    year_graph = (
        (graph_teen + delete_space + second_half)
        | (tens + delete_space + second_half)
        | thousands_form
    )
    year_graph.optimize()
    return year_graph
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date, e.g.
        january fifth twenty twelve ->
            date { month: "january" day: "5" year: "2012" preserve_order: true }
        the fifth of january twenty twelve ->
            date { day: "5" month: "january" year: "2012" preserve_order: true }
        twenty twenty -> date { year: "2020" preserve_order: true }

    Args:
        ordinal: OrdinalFst
    """

    def __init__(self, ordinal: GraphFst):
        super().__init__(name="date", kind="classify")

        year_weight = 0.001
        weighted_year = pynutil.add_weight(_get_year_graph(), year_weight)

        month_field = pynutil.insert('month: "') + _get_month_graph() + pynutil.insert('"')
        day_field = (
            pynutil.insert('day: "')
            + pynutil.add_weight(ordinal.graph, -0.7)
            + pynutil.insert('"')
        )

        # A year that follows a month: the opposite weight is added on top of
        # the weighted year graph.
        trailing_year = (
            delete_extra_space
            + pynutil.insert('year: "')
            + pynutil.add_weight(weighted_year, -year_weight)
            + pynutil.insert('"')
        )
        optional_trailing_year = pynini.closure(trailing_year, 0, 1)

        # month + day, month + year, or month + day + year
        spaced_day = delete_extra_space + day_field
        graph_mdy = month_field + (
            spaced_day | trailing_year | (spaced_day + trailing_year)
        )

        # "the <day> of <month> [<year>]"
        graph_dmy = (
            pynutil.delete("the")
            + delete_space
            + day_field
            + delete_space
            + pynutil.delete("of")
            + delete_extra_space
            + month_field
            + optional_trailing_year
        )

        # A year (or a decade/century/millennium range) on its own.
        standalone_year = (
            pynutil.insert('year: "')
            + (weighted_year | _get_range_graph())
            + pynutil.insert('"')
        )

        tagged = graph_mdy | graph_dmy | standalone_year
        tagged = tagged + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Returns FST that tags a cardinal or a decimal followed by a quantity word, e.g.
        one million -> integer_part: "1" quantity: "million"
        one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

    "thousand" is accepted as a quantity only after a decimal.

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # Cardinal output with leading zeros removed; at least one non-zero digit is required.
    strip_leading_zeros = (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(DAMO_DIGIT, "0")
        + pynini.closure(DAMO_DIGIT)
    )
    numbers = cardinal_up_to_hundred @ strip_leading_zeros
    suffix = pynini.union(
        "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion"
    )

    cardinal_with_quantity = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    decimal_with_quantity = (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + (suffix | "thousand")
        + pynutil.insert('"')
    )
    return cardinal_with_quantity | decimal_with_quantity
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal, e.g.
        minus twelve point five o o six billion ->
            decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" }
        one billion -> decimal { integer_part: "1" quantity: "billion" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # One spoken digit: an entry of digit.tsv or zero.tsv, or the letter "o" read as 0.
        spoken_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) | (
            pynini.string_file(get_abs_path("data/numbers/zero.tsv")) | pynini.cross("o", "0")
        )
        # One or more spoken digits; the spaces between them are deleted.
        digit_run = pynini.closure(spoken_digit + delete_space) + spoken_digit
        self.graph = digit_run

        optional_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
            0,
            1,
        )
        fractional_field = (
            pynutil.insert('fractional_part: "') + digit_run + pynutil.insert('"')
        )
        integer_field = (
            pynutil.insert('integer_part: "')
            + cardinal.graph_no_exception
            + pynutil.insert('"')
        )

        # "[<integer>] point <digits>"
        unsigned_decimal = (
            pynini.closure(integer_field + delete_extra_space, 0, 1)
            + pynutil.delete("point")
            + delete_extra_space
            + fractional_field
        )
        unsigned_quantity = get_quantity(
            unsigned_decimal, cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )
        self.final_graph_wo_negative = unsigned_decimal | unsigned_quantity

        tagged = (optional_negative + unsigned_decimal) | (
            optional_negative + unsigned_quantity
        )
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import DAMO_ALPHA, GraphFst, insert_space
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
        e.g. c d f one at a b c dot e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        # Deletes exactly one space character (a local graph, not the shared helper).
        drop_space = pynutil.delete(" ")

        alpha_num = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )
        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
        username_piece = alpha_num | symbols
        dot = pynini.cross("dot", ".")

        # ---- e-mail address ----
        # Either space-separated pieces, or (slightly penalised) a plain run of DAMO_ALPHA.
        spelled_username = alpha_num + pynini.closure(drop_space + username_piece)
        plain_username = pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
        username_field = (
            pynutil.insert('username: "')
            + (spelled_username | plain_username)
            + pynutil.insert('"')
        )

        spelled_alnum = pynini.closure(alpha_num + drop_space) + alpha_num
        server = spelled_alnum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")
        )
        domain = spelled_alnum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        domain_field = (
            pynutil.insert('domain: "')
            + server
            + drop_space
            + dot
            + drop_space
            + domain
            + pynutil.insert('"')
        )
        email = (
            username_field
            + drop_space
            + pynutil.delete("at")
            + insert_space
            + drop_space
            + domain_field
        )

        # ---- url ----
        www = pynini.cross(pynini.union("w w w", "www"), "www")
        scheme = (
            pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
        ) + pynini.cross(" colon slash slash ", "://")
        # .com,
        ending = (
            drop_space
            + symbols
            + drop_space
            + (domain | (pynini.closure(username_piece + drop_space) + username_piece))
        )
        url_body = (
            (
                (pynini.closure(drop_space + username_piece, 1) | server)
                | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
            )
            + pynini.closure(ending, 1)
        ).optimize()

        # "[scheme] www dot <body>", or "[www dot] <body>"
        url = (
            pynini.closure(scheme, 0, 1) + www + drop_space + dot + url_body
        ).optimize()
        url = url | (pynini.closure(www + drop_space + dot, 0, 1) + url_body)
        url_field = pynutil.insert('protocol: "') + url.optimize() + pynutil.insert('"')

        tagged = email | url_field
        self.fst = self.add_tokens(tagged).optimize()
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction

    NOTE(review): only the base-class initialisation is performed here; no
    grammar is built and ``self.fst`` is not assigned in this class.
    """

    def __init__(self):
        super().__init__(name="fraction", kind="classify")
        # Fields a fraction token would carry: integer_part # numerator # denominator
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
get_singulars,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure, e.g.
        minus twelve kilograms -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }

    "one" is paired with singular unit names; every other cardinal and all
    decimals are paired with plural unit names.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="classify")
        cardinal_graph = cardinal.graph_no_exception

        unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
        singular_to_abbr = pynini.invert(unit_table)  # singular -> abbr
        plural_to_abbr = get_singulars(singular_to_abbr)  # plural -> abbr

        optional_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
            0,
            1,
        )

        # "per <unit>" -> "/<abbr>"
        per_unit = (
            pynutil.insert("/")
            + pynutil.delete("per")
            + delete_space
            + convert_space(singular_to_abbr)
        )

        def units_field(unit):
            # A unit, a "per <unit>" expression, or (weight 0.01) a unit followed by one.
            return (
                pynutil.insert('units: "')
                + (
                    unit
                    | per_unit
                    | pynutil.add_weight(unit + delete_space + per_unit, 0.01)
                )
                + pynutil.insert('"')
            )

        singular_units = units_field(convert_space(singular_to_abbr))
        plural_units = units_field(convert_space(plural_to_abbr))

        def cardinal_measure(integer, units):
            # cardinal { [negative] integer: "<n>" } followed by the units field
            return (
                pynutil.insert("cardinal { ")
                + optional_negative
                + pynutil.insert('integer: "')
                + integer
                + pynutil.insert('"')
                + pynutil.insert(" }")
                + delete_extra_space
                + units
            )

        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_negative
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_units
        )
        subgraph_cardinal = cardinal_measure(
            (DAMO_SIGMA - "one") @ cardinal_graph, plural_units
        ) | cardinal_measure(pynini.cross("one", "1"), singular_units)

        tagged = subgraph_decimal | subgraph_cardinal
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
DAMO_NOT_SPACE,
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
get_singulars,
insert_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money, e.g.
        twelve dollars and five cents ->
            money { integer_part: "12" currency: "$" fractional_part: "05" }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency
        cardinal_graph = cardinal.graph_no_exception

        # add support for missing hundred (only for 3 digit numbers)
        # "one fifty" -> "one hundred fifty"
        insert_hundred = (
            pynini.closure(DAMO_NOT_SPACE)
            + pynini.accep(" ")
            + pynutil.insert("hundred ")
            + DAMO_SIGMA
        )
        three_digit_cardinal = pynini.compose(cardinal_graph, DAMO_DIGIT**3)
        cardinal_graph |= pynini.compose(insert_hundred, three_digit_cardinal)

        currency_singular = pynini.invert(pynini.string_file(get_abs_path("data/currency.tsv")))
        currency_plural = get_singulars(currency_singular)

        def currency_field(unit):
            return pynutil.insert('currency: "') + convert_space(unit) + pynutil.insert('"')

        singular_currency = currency_field(currency_singular)
        plural_currency = currency_field(currency_plural)

        # Two digits pass through unchanged; a single digit gets a leading "0".
        two_digits = (DAMO_DIGIT + DAMO_DIGIT) | (pynutil.insert("0") + DAMO_DIGIT)

        # twelve dollars (and) fifty cents, zero cents
        cents_value = pynini.union(
            (
                pynutil.add_weight(((DAMO_SIGMA - "one") @ cardinal_graph), -0.7)
                @ two_digits
            )
            + delete_space
            + (pynutil.delete("cents") | pynutil.delete("cent")),
            pynini.cross("one", "01") + delete_space + pynutil.delete("cent"),
        )
        cents_standalone = (
            pynutil.insert('fractional_part: "') + cents_value + pynutil.insert('"')
        )
        optional_cents_standalone = pynini.closure(
            delete_space
            + pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
            + insert_space
            + cents_standalone,
            0,
            1,
        )
        # twelve dollars fifty, only after integer
        optional_cents_suffix = pynini.closure(
            delete_extra_space
            + pynutil.insert('fractional_part: "')
            + pynutil.add_weight(cardinal_graph @ two_digits, -0.7)
            + pynutil.insert('"'),
            0,
            1,
        )
        optional_cents = optional_cents_standalone | optional_cents_suffix

        def whole_amount(integer, currency):
            # integer_part, then the currency field, then optional cents
            return (
                pynutil.insert('integer_part: "')
                + integer
                + pynutil.insert('"')
                + delete_extra_space
                + currency
                + optional_cents
            )

        # "one" takes the singular currency name, everything else the plural.
        graph_integer = whole_amount(
            (DAMO_SIGMA - "one") @ cardinal_graph, plural_currency
        ) | whole_amount(pynini.cross("one", "1"), singular_currency)

        graph_decimal = decimal.final_graph_wo_negative + delete_extra_space + plural_currency
        # Cents on their own are tagged as zero units of "$".
        graph_decimal |= pynutil.insert('currency: "$" integer_part: "0" ') + cents_standalone

        tagged = graph_integer | graph_decimal
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
        e.g. thirteenth -> ordinal { integer: "13" }

    The ordinal ending is first rewritten into a cardinal spelling, and the
    result is then parsed by the cardinal grammar.

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        ordinal_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        ordinal_teen = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))

        # Arbitrary prefix, then one rewritten ordinal ending.
        prefix = pynini.closure(DAMO_CHAR)
        ending = pynini.union(
            ordinal_digit,
            ordinal_teen,
            pynini.cross("tieth", "ty"),
            pynini.cross("th", ""),
        )
        self.graph = (prefix + ending) @ cardinal.graph_no_exception

        tagged = pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        self.fst = self.add_tokens(tagged).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
        e.g. a, -> tokens { name: "a" } tokens { name: "," }

    Accepts exactly one punctuation character and wraps it in a name field.
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")
        marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        any_mark = pynini.union(*marks)
        tagged = pynutil.insert('name: "') + any_mark + pynutil.insert('"')
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_ALNUM,
DAMO_ALPHA,
DAMO_DIGIT,
GraphFst,
insert_space,
)
from pynini.lib import pynutil
def get_serial_number(cardinal):
    """
    Any space-separated alphanumerical sequence of at least three symbols
    whose output contains at least one digit. The separating spaces are deleted.

    Args:
        cardinal: object exposing ``graph_no_exception``
    """
    # Cardinal readings whose output is exactly one digit.
    single_digit = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT)
    symbol = single_digit | DAMO_ALPHA
    # First symbol plus at least two more, each preceded by one deleted space.
    spelled = symbol + pynini.closure(pynutil.delete(" ") + symbol, 2)
    contains_digit = pynini.closure(DAMO_ALNUM) + DAMO_DIGIT + pynini.closure(DAMO_ALNUM)
    return (spelled @ contains_digit).optimize()
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers, e.g.
        one two three one two three five six seven eight -> { number_part: "123-123-5678" }

    This class also supports card numbers, SSN-shaped numbers, IP addresses
    and generic serial numbers:
        "one two three dot one double three dot o dot four o" -> { number_part: "123.133.0.40" }
        "three two double seven three two one four three two one four three double zero five" ->
            { number_part: "3277 3214 3214 3005" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="telephone", kind="classify")
        # country code, number_part, extension
        # digit character -> spoken word (digit.tsv inverted); "0" may be read
        # as "o", "oh" or "zero".
        digit_to_str = (
            pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
            | pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
        )
        str_to_digit = pynini.invert(digit_to_str)
        # For each digit 0-9: "<word> <word>" paired with "double <word>".
        double_digit = pynini.union(
            *[
                pynini.cross(
                    pynini.project(str(i) @ digit_to_str, "output")
                    + pynini.accep(" ")
                    + pynini.project(str(i) @ digit_to_str, "output"),
                    pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"),
                )
                for i in range(10)
            ]
        )
        # The return value is discarded, so this relies on invert() modifying
        # double_digit in place.
        double_digit.invert()
        # to handle cases like "one twenty three"
        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT**2)
        # Two output digits from either a "double <word>" phrase or a two-digit cardinal.
        double_digit_to_digit = (
            pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit)
            | two_digit_cardinal
        )
        # The two-digit reading gets a small negative weight relative to a single digit.
        single_or_double_digit = (
            pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit
        ).optimize()
        # Extend to a space-separated run; each additional item adds weight 0.0001.
        single_or_double_digit |= (
            single_or_double_digit
            + pynini.closure(
                pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001)
            )
        ).optimize()
        # Ten digits formatted as ddd-ddd-dddd.
        number_part = pynini.compose(
            single_or_double_digit,
            DAMO_DIGIT**3
            + pynutil.insert("-")
            + DAMO_DIGIT**3
            + pynutil.insert("-")
            + DAMO_DIGIT**4,
        ).optimize()
        number_part = (
            pynutil.insert('number_part: "') + number_part.optimize() + pynutil.insert('"')
        )
        # A digit run whose output is two or three digits long.
        cardinal_option = pynini.compose(single_or_double_digit, DAMO_DIGIT ** (2, 3))
        # Optional "plus " -> "+", then one to three spoken digits or cardinal_option.
        country_code = (
            pynutil.insert('country_code: "')
            + pynini.closure(pynini.cross("plus ", "+"), 0, 1)
            + (
                (pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit)
                | cardinal_option
            )
            + pynutil.insert('"')
        )
        optional_country_code = pynini.closure(
            country_code + pynutil.delete(" ") + insert_space, 0, 1
        ).optimize()
        graph = optional_country_code + number_part
        # credit card number
        # Sixteen digits: four digits, then three more groups of four, each preceded by insert_space.
        space_four_digits = insert_space + DAMO_DIGIT**4
        credit_card_graph = pynini.compose(
            single_or_double_digit, DAMO_DIGIT**4 + space_four_digits**3
        ).optimize()
        graph |= (
            pynutil.insert('number_part: "') + credit_card_graph.optimize() + pynutil.insert('"')
        )
        # SSN
        # Nine digits formatted as ddd-dd-dddd.
        ssn_graph = pynini.compose(
            single_or_double_digit,
            DAMO_DIGIT**3
            + pynutil.insert("-")
            + DAMO_DIGIT**2
            + pynutil.insert("-")
            + DAMO_DIGIT**4,
        ).optimize()
        graph |= pynutil.insert('number_part: "') + ssn_graph.optimize() + pynutil.insert('"')
        # ip
        # One address component: combinations of single spoken digits and
        # two-digit readings, or cardinal_option.
        digit_or_double = (
            pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit
        )
        digit_or_double |= double_digit_to_digit + pynini.closure(
            pynutil.delete(" ") + str_to_digit, 0, 1
        )
        digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit) ** (0, 2)
        digit_or_double |= cardinal_option
        digit_or_double = digit_or_double.optimize()
        # Four components joined by " dot " -> ".".
        ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3
        graph |= pynutil.insert('number_part: "') + ip_graph.optimize() + pynutil.insert('"')
        # Generic serial number, added with weight 0.0001.
        graph |= (
            pynutil.insert('number_part: "')
            + pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
            + pynutil.insert('"')
        )
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time, e.g.
        twelve thirty -> time { hours: "12" minutes: "30" }
        twelve past one -> time { minutes: "12" hours: "1" }
        two o clock a m -> time { hours: "2" suffix: "a.m." }
        quarter to two -> time { hours: "1" minutes: "45" }
        quarter past two -> time { hours: "2" minutes: "15" }
        half past two -> time { hours: "2" minutes: "30" }
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period
        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
        time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
        to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))
        minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))

        # only used for < 1000 thousand -> 0 weight
        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        def spoken_range(start, stop):
            # Spoken forms of start..stop-1, converted to digits by the cardinal graph.
            return pynini.union(*[num_to_word(n) for n in range(start, stop)]) @ cardinal

        hour_value = spoken_range(0, 24)
        single_minute = spoken_range(1, 10)
        double_minute = spoken_range(10, 60)
        verbose_minute = pynini.cross("half", "30") | pynini.cross("quarter", "15")
        oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

        hour_field = pynutil.insert('hours: "') + hour_value + pynutil.insert('"')
        # "o'clock" -> 00, "o <1-9>" -> that digit, or a spoken 10-59.
        minute_value = (
            (oclock + pynutil.insert("00"))
            | (pynutil.delete("o") + delete_space + single_minute)
            | double_minute
        )

        suffix_field = (
            delete_space
            + insert_space
            + (pynutil.insert('suffix: "') + convert_space(suffix_graph) + pynutil.insert('"'))
        )
        optional_suffix = pynini.closure(suffix_field, 0, 1)
        optional_zone = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert('zone: "')
            + convert_space(time_zone_graph)
            + pynutil.insert('"'),
            0,
            1,
        )

        # five o' clock
        # two o eight, two thirty five (am/pm)
        # two pm/am
        graph_hm = (
            hour_field
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + minute_value
            + pynutil.insert('"')
        )

        # 10 past four, quarter past four, half past four
        graph_m_past_h = (
            pynutil.insert('minutes: "')
            + pynini.union(single_minute, double_minute, verbose_minute)
            + pynutil.insert('"')
            + delete_space
            + pynutil.delete("past")
            + delete_extra_space
            + hour_field
        )

        # Pieces shared by the two "... to <hour>" forms; the hour word is
        # mapped through to_hour_graph.
        to_or_till = pynutil.delete(pynini.union("to", "till"))
        mapped_hour_field = pynutil.insert('hours: "') + to_hour_graph + pynutil.insert('"')

        # quarter to/till <hour>
        graph_quarter_time = (
            pynutil.insert('minutes: "')
            + pynini.cross("quarter", "45")
            + pynutil.insert('"')
            + delete_space
            + to_or_till
            + delete_extra_space
            + mapped_hour_field
        )

        # <minutes> [min|mins|minute|minutes] to/till <hour> <suffix>; the
        # suffix is mandatory in this form.
        graph_m_to_h_suffix_time = (
            pynutil.insert('minutes: "')
            + ((single_minute | double_minute).optimize() @ minute_to_graph)
            + pynutil.insert('"')
            + pynini.closure(
                delete_space + pynutil.delete(pynini.union("min", "mins", "minute", "minutes")),
                0,
                1,
            )
            + delete_space
            + to_or_till
            + delete_extra_space
            + mapped_hour_field
            + suffix_field
        )

        # <hour> [<minutes>] <suffix> [<zone>]; missing minutes become "00".
        graph_h = (
            hour_field
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + (pynutil.insert("00") | minute_value)
            + pynutil.insert('"')
            + suffix_field
            + optional_zone
        )

        tagged = (
            (graph_hm | graph_m_past_h | graph_quarter_time) + optional_suffix + optional_zone
        )
        tagged = tagged | graph_h
        tagged = tagged | graph_m_to_h_suffix_time
        self.fst = self.add_tokens(tagged.optimize()).optimize()
import os
import pynini
from fun_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.en.taggers.date import DateFst
from fun_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.en.taggers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
from fun_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.en.taggers.time import TimeFst
from fun_text_processing.inverse_text_normalization.en.taggers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.en.taggers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from pynini.lib import pynutil
import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an
    entire sentence, that is lower cased.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        # Resolve the cache location; the literal string "None" also disables caching.
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_en_itn.far")

        # Cache hit: restore the compiled grammar and stop here.
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
            return

        logging.info(f"Creating ClassifyFst grammars.")
        cardinal = CardinalFst()
        ordinal = OrdinalFst(cardinal)
        decimal = DecimalFst(cardinal)
        measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
        date_graph = DateFst(ordinal=ordinal).fst
        word_graph = WordFst().fst
        time_graph = TimeFst().fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
        whitelist_graph = WhiteListFst().fst
        punct_graph = PunctuationFst().fst
        electronic_graph = ElectronicFst().fst
        telephone_graph = TelephoneFst(cardinal).fst

        # (grammar, weight) pairs, unioned in this order; the plain-word
        # fallback carries by far the largest weight.
        weighted_grammars = [
            (whitelist_graph, 1.01),
            (time_graph, 1.1),
            (date_graph, 1.09),
            (decimal.fst, 1.1),
            (measure_graph, 1.1),
            (cardinal.fst, 1.1),
            (ordinal.fst, 1.1),
            (money_graph, 1.1),
            (telephone_graph, 1.1),
            (electronic_graph, 1.1),
            (word_graph, 100),
        ]
        first_grammar, first_weight = weighted_grammars[0]
        classify = pynutil.add_weight(first_grammar, first_weight)
        for grammar, weight in weighted_grammars[1:]:
            classify = classify | pynutil.add_weight(grammar, weight)

        punct = (
            pynutil.insert("tokens { ")
            + pynutil.add_weight(punct_graph, weight=1.1)
            + pynutil.insert(" }")
        )
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        # One token with any number of punctuation tokens before and after it.
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" "))
            + token
            + pynini.closure(pynutil.insert(" ") + punct)
        )

        sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        sentence = delete_space + sentence + delete_space
        self.fst = sentence.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
import pynini
from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens
        e.g. misses -> tokens { name: "mrs." }
    This class has highest priority among all classifier grammars.
    Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")
        # The table is inverted after loading, so its second column becomes the input side.
        spoken_to_written = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
        tagged = (
            pynutil.insert('name: "') + convert_space(spoken_to_written) + pynutil.insert('"')
        )
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_SPACE, GraphFst
from pynini.lib import pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens, that do not belong to any special class.
    This can be considered as the default class.
        e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")
        # One or more DAMO_NOT_SPACE symbols, copied through unchanged.
        any_token = pynini.closure(DAMO_NOT_SPACE, 1)
        tagged = pynutil.insert('name: "') + any_token + pynutil.insert('"')
        self.fst = tagged.optimize()
import os
from typing import Union
import inflect
# Single inflect engine created at import time and reused by num_to_word.
_inflect = inflect.engine()
def num_to_word(x: Union[str, int]):
    """
    Converts an integer to its spoken representation, with hyphens turned
    into spaces and commas removed.

    Args:
        x: integer (an int, or its string form)

    Returns: spoken representation
    """
    spoken = _inflect.number_to_words(str(x))
    return spoken.replace("-", " ").replace(",", "")
def get_abs_path(rel_path):
    """
    Get absolute path

    Args:
        rel_path: path relative to the directory containing this file

    Returns absolute path: this file's directory, "/" and rel_path joined together
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return "/".join((module_dir, rel_path))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment