"megatron/fused_kernels/layer_norm_cuda_kernel.cu" did not exist on "6c3f6c7bb582b4509b28b64c3772e56f11627b7f"
Commit 70a8a9e0 authored by wangwei990215's avatar wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
I 1
II 2
III 3
IV 4
V 5
VI 6
VII 7
VIII 8
IX 9
\ No newline at end of file
C 1
CC 2
CCC 3
CD 4
D 5
DC 6
DCC 7
DCCC 8
CM 9
\ No newline at end of file
X 1
XX 2
XXX 3
XL 4
L 5
LX 6
LXX 7
LXXX 8
XC 9
\ No newline at end of file
1 13
2 14
3 15
4 16
5 17
6 18
7 19
8 20
9 21
10 22
11 23
12 0
\ No newline at end of file
zéro 0
une 1
deux 2
trois 3
quatre 4
cinq 5
six 6
sept 7
huit 8
neuf 9
dix 10
onze 11
douze 12
treize 13
quatorze 14
quinze 15
seize 16
dix-sept 17
dix-huit 18
dix-neuf 19
vingt 20
vingt-et-une 21
vingt et une 21
vingt-deux 22
vingt-trois 23
vingt-quatre 24
\ No newline at end of file
1 0
2 1
3 2
4 3
5 4
6 5
7 6
8 7
9 8
10 9
11 10
12 11
13 12
14 13
15 14
16 15
17 16
18 17
19 18
20 19
21 20
22 21
23 22
24 23
0 23
\ No newline at end of file
une 01
deux 02
trois 03
quatre 04
cinq 05
six 06
sept 07
huit 08
neuf 09
dix 10
onze 11
douze 12
treize 13
quatorze 14
quinze 15
seize 16
dix-sept 17
dix-huit 18
dix-neuf 19
vingt 20
vingt-et-une 21
vingt et une 21
vingt-deux 22
vingt-trois 23
vingt-quatre 24
vingt-cinq 25
vingt-six 26
vingt-sept 27
vingt-huit 28
vingt-neuf 29
trente 30
trente-et-une 31
trente et une 31
trente-deux 32
trente-trois 33
trente-quatre 34
trente-cinq 35
trente-six 36
trente-sept 37
trente-huit 38
trente-neuf 39
quarante 40
quarante-et-une 41
quarante et une 41
quarante-deux 42
quarante-trois 43
quarante-quatre 44
quarante-cinq 45
quarante-six 46
quarante-sept 47
quarante-huit 48
quarante-neuf 49
cinquante 50
cinquante-et-une 51
cinquante et une 51
cinquante-deux 52
cinquante-trois 53
cinquante-quatre 54
cinquante-cinq 55
cinquante-six 56
cinquante-sept 57
cinquante-huit 58
cinquante-neuf 59
\ No newline at end of file
01 59
02 58
03 57
04 56
05 55
06 54
07 53
08 52
09 51
10 50
11 49
12 48
13 47
14 46
15 45
16 44
17 43
18 42
19 41
20 40
21 39
22 38
23 37
24 36
25 35
26 34
27 33
28 32
29 31
30 30
31 29
32 28
33 27
34 26
35 25
36 24
37 23
38 22
39 21
40 20
41 19
42 18
43 17
44 16
45 15
46 14
47 13
48 12
49 11
50 10
51 09
52 08
53 07
54 06
55 05
56 04
57 03
58 02
59 01
\ No newline at end of file
monsieur M.
messieurs MM.
madame Mᵐᵉ
mesdames Mᵐᵉˢ
mademoiselle Mˡˡᵉ
mademoiselles Mˡˡᵉˢ
docteur Dʳ
docteurs Dʳˢ
docteure Dʳᵉ
docteures Dʳᵉˢ
après jésus-christ apr. J.-C.
avant Jésus-Christ av. J.-C.
ca v.
vers v.
l’honorable le hon.
le très honorable le très hon.
\ No newline at end of file
import os
import string
from pathlib import Path
from typing import Dict
import pynini
from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
from pynini import Far
from pynini.examples import plurals
from pynini.export import export
from pynini.lib import byte, pynutil, utf8
# Shared character-class acceptors and small helper transducers used by the
# French inverse-text-normalization grammars.
DAMO_CHAR = utf8.VALID_UTF8_CHAR  # any single valid UTF-8 character
DAMO_DIGIT = byte.DIGIT
DAMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
DAMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()  # ASCII letters only
DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
DAMO_HEX = pynini.union(*string.hexdigits).optimize()
DAMO_NON_BREAKING_SPACE = "\u00A0"
DAMO_SPACE = " "
# Space, tab, newline, carriage return and the non-breaking space.
DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
DAMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
DAMO_GRAPH = pynini.union(DAMO_ALNUM, DAMO_PUNCT).optimize()
# Any string (zero or more) of valid UTF-8 characters.
DAMO_SIGMA = pynini.closure(DAMO_CHAR)
# Deletes zero or more whitespace characters.
delete_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
# Rewrites a run of one or more whitespace characters to a single space.
delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")
# French frequently compounds numbers with hyphen.
# Deletes at most one hyphen (the hyphen is optional).
delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1))
insert_hyphen = pynutil.insert("-")
# Irregular singular/plural pairs loaded from data/suppletive.tsv.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# Regular pluralisation patterns:
#   _s   appends "s" to any string,
#   _x   appends "x" to strings ending in "eau", "eu" or "ou",
#   _aux rewrites a final "al" / "ail" to "aux".
_s = DAMO_SIGMA + pynutil.insert("s")
_x = DAMO_SIGMA + pynini.string_map([("eau"), ("eu"), ("ou")]) + pynutil.insert("x")
_aux = DAMO_SIGMA + pynini.string_map([("al", "aux"), ("ail", "aux")])
# Combined through the pynini.examples.plurals priority-union helper;
# presumably the first argument wins where both apply -- confirm against that helper.
graph_plural = plurals._priority_union(
    suppletive, plurals._priority_union(_s, pynini.union(_x, _aux), DAMO_SIGMA), DAMO_SIGMA
).optimize()
SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
# Maps each ASCII uppercase letter to its lowercase counterpart.
TO_LOWER = pynini.union(
    *[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
)
# Inverse of TO_LOWER: ASCII lowercase letter -> uppercase letter.
TO_UPPER = pynini.invert(TO_LOWER)
def generator_main(file_name: str, graphs: Dict[str, pynini.FstLike]):
    """
    Exports graphs as an OpenFst finite state archive (FAR) file with given file name and rule names.

    Each graph is optimized before being stored under its rule name. The
    exporter is closed even when optimizing or storing a rule raises, so the
    archive handle is never left open on failure.

    Args:
        file_name: exported file name
        graphs: Mapping of a rule name and Pynini WFST graph to be exported
    """
    exporter = export.Exporter(file_name)
    try:
        for rule, graph in graphs.items():
            exporter[rule] = graph.optimize()
    finally:
        # Release the archive whether or not every rule was written.
        exporter.close()
    # Only reached when the whole export succeeded.
    print(f"Created {file_name}")
def get_plurals(fst):
    """
    Composes the singular-to-plural transducer on the input side of ``fst``.

    Args:
        fst: Fst

    Returns:
        SINGULAR_TO_PLURAL composed with the given fst
    """
    pluralized = SINGULAR_TO_PLURAL @ fst
    return pluralized
def get_singulars(fst):
    """
    Composes the plural-to-singular transducer on the input side of ``fst``.

    Args:
        fst: Fst

    Returns:
        PLURAL_TO_SINGULAR composed with the given fst
    """
    singularized = PLURAL_TO_SINGULAR @ fst
    return singularized
def convert_space(fst) -> "pynini.FstLike":
    """
    Rewrites every plain space in the output of ``fst`` to a non-breaking space.

    Intended for tagger grammars that emit token values inside quotes,
    e.g. name: "hello kitty". The extra composition makes the transducer
    noticeably slower, so apply it only where spaces can actually occur
    inside the quoted value.

    Args:
        fst: input fst

    Returns:
        output fst where breaking spaces are converted to non breaking spaces
    """
    # Context-free rewrite: " " -> U+00A0 everywhere in the string.
    space_to_nbsp = pynini.cdrewrite(
        pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
    )
    return fst @ space_to_nbsp
class GraphFst:
    """
    Common parent of all grammar FST classes.

    On construction the class looks for a pre-compiled archive at
    ``<directory of this module>/grammars/<kind>/<name>.far``. When that file
    exists its FST is loaded; otherwise the FST starts out as None and can be
    assigned later through the ``fst`` property.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: stored on the instance; meant to select a single
            transduction option (True) versus multiple options (False, used for
            audio-based normalization). This base class does not consult it.
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self.deterministic = deterministic
        self._fst = None

        module_dir = os.path.dirname(__file__)
        self.far_path = Path(module_dir + "/grammars/" + kind + "/" + name + ".far")
        if not self.far_exist():
            return
        # A compiled grammar is available: load it instead of rebuilding.
        archive = Far(self.far_path, mode="r", arc_type="standard", far_type="default")
        self._fst = archive.get_fst()

    def far_exist(self) -> bool:
        """
        Tells whether the FAR file for this grammar is present on disk.
        """
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps the output of the given fst as ``<name> { ... }``.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        opening = pynutil.insert(f"{self.name} {{ ")
        closing = pynutil.insert(" }")
        return opening + fst + closing

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Strips the ``<name> { ... }`` wrapper around the input of the given fst
        and turns non-breaking spaces in the result back into plain spaces.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        unwrapped = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        nbsp_to_space = pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
        return unwrapped @ nbsp_to_space
import pynini
from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
DAMO_CHAR,
DAMO_DIGIT,
DAMO_NOT_SPACE,
DAMO_SIGMA,
DAMO_SPACE,
GraphFst,
delete_hyphen,
)
from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
from pynini.lib import pynutil
def rewrite(cardinal: "pynini.FstLike") -> "pynini.FstLike":
    """
    Pre-composes ``cardinal`` with a graph that maps cardinals written in
    traditional orthography (no '-' for numbers >100) to current orthography
    ('-' between all words in the number string).
        e.g. deux mille cent vingt-trois -> deux-mille-cent-vingt-trois.

    Two kinds of input reach ``cardinal``:
      * strings containing no whitespace at all, passed through unchanged;
      * strings in which none of the target words (et, cent, mille, million, ...
        with an optional final 's') is adjacent to a hyphen; in these, every
        space directly before or after a target word is replaced by '-'.

    A string mixing both orthographies (a space somewhere and a hyphen next to
    a target word) gets a '#' marker and is filtered out, so it never reaches
    ``cardinal``.

    Args:
        cardinal: cardinal FST
    """
    # Traditional orthography does not hyphenate numbers > 100; these are the
    # words around which hyphens have to be inserted.
    number_words = [
        "et",  # for 'et un/onze'
        "cent",
        "mille",
        "million",
        "milliard",
        "billion",
        "billiard",
        "trillion",
        "trilliard",
    ]
    targets = pynini.string_map(number_words)
    targets += pynini.accep("s").ques  # optional plural 's'

    # Valid numbers in reformed orthography will have no spaces.
    new_orthography_sigma = pynini.closure(DAMO_NOT_SPACE)

    # Old orthography never has a hyphen touching a target word: mark such
    # occurrences with '#', a character the cardinal grammar does not accept.
    hyphenated_target = ("-" + targets) | ("-" + targets + "-") | (targets + "-")
    old_orthography_filter = pynini.cdrewrite(
        pynini.cross(hyphenated_target, "#"), "", "", DAMO_SIGMA
    )
    old_orthography_sigma = pynini.difference(DAMO_CHAR, "#")
    # Fst.closure() modifies the FST in place (see pynini docs).
    old_orthography_sigma.closure()
    # Only accept strings that occur in old orthography. (This avoids tying two
    # non-related numbers together.)
    old_orthography_filter @= old_orthography_sigma

    # Replacements now only happen around targets.
    hyphen_before_target = pynini.cdrewrite(pynini.cross(" ", "-"), "", targets, DAMO_SIGMA)
    hyphen_after_target = pynini.cdrewrite(pynini.cross(" ", "-"), targets, "", DAMO_SIGMA)
    hyphenate = hyphen_before_target @ hyphen_after_target

    normalized = new_orthography_sigma | (old_orthography_filter @ hyphenate)
    return normalized @ cardinal
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals
        e.g. moins vingt-trois -> cardinal { negative: "-" integer: "23"}
    This class converts cardinals up to (but not including) "un-quatrillion",
    i.e up to "one septillion" in English (10^{24}).
    Cardinals from zero to nine are not converted by the final tagger (in order to avoid
    "j'ai un pomme." --> "j'ai 1 pomme" and any other odd conversions.)
    This transducer accommodates both traditional hyphenation of numbers ('-' for most numbers <100)
    and current hyphenation (all elements of number are hyphenated), prioritizing the latter.
        e.g cent cinquante et un -> cardinal { integer: "151"}
            cent-cinquante-et-un -> cardinal { integer: "151"}
    This is done through a context dependent rewrite that attempts to map old spelling to new.
        e.g. cent cinquante et un -> cent-cinquante-et-un

    Attributes set by the constructor (besides ``fst``):
        graph_hundreds_component_at_least_one_none_zero_digit: three-digit groups
            containing at least one non-zero digit, wrapped in ``rewrite``
        graph_no_exception: full cardinal graph, including zero to nine
        numbers_up_to_thousand: cardinal graph restricted to 1-3 output digits
        numbers_up_to_million: cardinal graph restricted to 1-6 output digits
        graph: cardinal graph without the zero-to-nine inputs
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties_unique = pynini.string_file(get_abs_path("data/numbers/ties_unique.tsv"))
        # Tens components
        # A tie followed by either (optional hyphen + digit) or an inserted "0".
        graph_tens_component = graph_ties + ((delete_hyphen + graph_digit) | pynutil.insert("0"))
        graph_tens_component = pynini.union(graph_tens_component, graph_teens, graph_ties_unique)
        # Two-digit slot: a tens component, or "0" followed by a digit
        # (or by another inserted "0" at a small weight penalty).
        graph_tens_component_with_leading_zeros = pynini.union(
            graph_tens_component,
            (pynutil.insert("0") + (graph_digit | pynutil.insert("0", weight=0.01))),
        )
        # Hundreds components
        graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
        graph_cent_plural = pynini.cross(
            "cents", "00"
        )  # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201
        # Digit graph with the inputs "un" / "une" removed.
        graph_digit_no_one = pynini.project(pynini.union("un", "une"), "input")
        graph_digit_no_one = (
            pynini.project(graph_digit, "input") - graph_digit_no_one.arcsort()
        ) @ graph_digit
        graph_hundreds_component_singular = (
            graph_digit_no_one + delete_hyphen + graph_cent_singular
        )  # Regular way: [1-9] * 100
        # A bare "cent" stands for a leading "1".
        graph_hundreds_component_singular = pynini.union(
            graph_hundreds_component_singular, pynini.cross("cent", "1")
        )
        graph_hundreds_component_singular += delete_hyphen
        graph_hundreds_component_singular += graph_tens_component_with_leading_zeros
        graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural
        # Three-digit group: "<n> cent <tens>", "<n> cents", or "0" + two-digit slot.
        graph_hundreds_component = pynini.union(
            graph_hundreds_component_singular,
            graph_hundreds_component_plural,
            pynutil.insert("0") + graph_tens_component_with_leading_zeros,
        )
        # Keep only outputs that contain at least one non-zero digit.
        graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
        )
        # Exposed (with the orthography rewrite applied) for use by other grammars.
        self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
            graph_hundreds_component_at_least_one_none_zero_digit
        ).optimize()
        # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
        graph_tens_of_hundreds_component_singular = (
            graph_tens_component + delete_hyphen + graph_cent_singular
        )  # Tens of hundreds. e.g. 1900 = nineteen hundred/ 'dix neuf cents"
        graph_tens_of_hundreds_component_singular += (
            delete_hyphen + graph_tens_component_with_leading_zeros
        )
        graph_tens_of_hundreds_component_plural = (
            graph_tens_component + delete_hyphen + graph_cent_plural
        )
        graph_tens_of_hundred_component = (
            graph_tens_of_hundreds_component_plural | graph_tens_of_hundreds_component_singular
        )
        # Each magnitude group below is either a three-digit group followed by
        # the (deleted) magnitude word, or an inserted "000" at weight 0.1 when
        # that magnitude is absent.
        graph_thousands = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + pynutil.delete("mille"),
            pynutil.insert("001") + pynutil.delete("mille"),  # because 'mille', not 'un mille'
            pynutil.insert("000", weight=0.1),
        )
        # All other large amounts
        graph_millions = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("million") | pynutil.delete("millions")),
            pynutil.insert("000", weight=0.1),
        )
        graph_milliards = pynini.union(  # French for English 'billion'
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("milliard") | pynutil.delete("milliards")),
            pynutil.insert("000", weight=0.1),
        )
        graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("billions") | pynutil.delete("billion")),
            pynutil.insert("000", weight=0.1),
        )
        # NOTE(review): graph_mille_billion is built here but is not referenced
        # in the final `graph` union below; it also deletes "billions" where the
        # trailing comment suggests "millions" was meant -- confirm before use.
        graph_mille_billion = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + pynutil.delete("mille"),
            pynutil.insert("001")
            + pynutil.delete("mille"),  # because we say 'mille', not 'un mille'
        )
        graph_mille_billion += delete_hyphen + (
            graph_millions | pynutil.insert("000") + pynutil.delete("billions")
        )  # allow for 'mil millones' (comment apparently inherited from a Spanish grammar)
        graph_mille_billion |= pynutil.insert("000000", weight=0.1)
        graph_billiards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("billiards") | pynutil.delete("billiard")),
            pynutil.insert("000", weight=0.1),
        )
        graph_trillions = pynini.union(  # One thousand English trillions.
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("trillions") | pynutil.delete("trillion")),
            pynutil.insert("000", weight=0.1),
        )
        graph_trilliards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
            pynutil.insert("000", weight=0.1),
        )
        # Full number: magnitude groups from largest to smallest, each separated
        # by an optional hyphen; alternatively a "tens of hundreds" form or zero.
        graph = pynini.union(
            graph_trilliards
            + delete_hyphen
            + graph_trillions
            + delete_hyphen
            + graph_billiards
            + delete_hyphen
            + graph_billions
            + delete_hyphen
            + graph_milliards
            + delete_hyphen
            + graph_millions
            + delete_hyphen
            + graph_thousands
            + delete_hyphen
            + graph_hundreds_component,
            graph_tens_of_hundred_component,
            graph_zero,
        )
        # Strip leading zeros from the digit string; a lone "0" is kept as is.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0"))
            + pynini.difference(DAMO_DIGIT, "0")
            + pynini.closure(DAMO_DIGIT),
            "0",
        )
        # Accept traditional (space-separated) orthography as well.
        graph = rewrite(graph)
        self.graph_no_exception = graph.optimize()
        # save self.numbers_up_to_thousand for use in DecimalFst
        digits_up_to_thousand = DAMO_DIGIT | (DAMO_DIGIT**2) | (DAMO_DIGIT**3)
        numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand
        # save self.numbers_up_to_million for use in DecimalFst
        digits_up_to_million = (
            DAMO_DIGIT
            | (DAMO_DIGIT**2)
            | (DAMO_DIGIT**3)
            | (DAMO_DIGIT**4)
            | (DAMO_DIGIT**5)
            | (DAMO_DIGIT**6)
        )
        numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million
        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
        # Optional leading "moins " becomes the field negative: "-".
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", '"-"') + DAMO_SPACE, 0, 1
        )
        final_graph = (
            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
GraphFst,
delete_extra_space,
)
from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for classifying dates written as a day, a month and
    an optional year.
        e.g. vingt-quatre juillet deux-mille-treize -> date { day: "24" month: "juillet" year: "2013" preserve_order: true }
        e.g. premier janvier -> date { day: "1" month: "janvier" preserve_order: true }
    (The month value shown assumes data/months.tsv maps the month name to itself.)

    Day and year both use the cardinal graph that includes zero to nine, so the
    colloquial "tens of hundreds" year spelling is accepted as well.
        e.g. vingt mai dix-neuf-cent-quatre -> date { day: "20" month: "mai" year: "1904" preserve_order: true }
    "premier" is the only ordinal accepted as a day.

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="date", kind="classify")
        self.cardinal = cardinal.graph_no_exception

        def quoted_field(label: str, value_graph):
            # Emits `<label>: "<value>"` around the output of value_graph.
            return pynutil.insert(f'{label}: "') + value_graph + pynutil.insert('"')

        months = pynini.string_file(get_abs_path("data/months.tsv"))
        month_field = quoted_field("month", months)

        # Premier is only ordinal used for dates
        days = self.cardinal | pynini.cross("premier", "1")
        day_field = quoted_field("day", days)

        # The year, when present, follows the month after whitespace.
        year_field = pynini.closure(
            delete_extra_space + quoted_field("year", self.cardinal), 0, 1
        )

        day_month_year = day_field + delete_extra_space + month_field + year_field
        tagged = self.add_tokens(day_month_year + pynutil.insert(" preserve_order: true"))
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
DAMO_DIGIT,
GraphFst,
delete_extra_space,
delete_hyphen,
delete_space,
)
from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
from pynini.lib import pynutil
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_thousand: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Returns FST that tags either a cardinal or a decimal followed by a large-number
    word (million(s), milliard(s), billion(s), billiard(s), trillion(s), trilliard(s)),
        e.g. deux millions -> integer_part: "2" quantity: "millions"
        e.g. un virgule cinq million -> integer_part: "1" fractional_part: "5" quantity: "million"
    The quantity word is copied to the output exactly as it was written.

    Args:
        decimal: decimal FST
        cardinal_up_to_thousand: cardinal FST used for the integer part of the
            cardinal + quantity form; leading zeros of its output are removed
    """
    strip_leading_zeros = (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(DAMO_DIGIT, "0")
        + pynini.closure(DAMO_DIGIT)
    )
    numbers = cardinal_up_to_thousand @ strip_leading_zeros

    # Singular and plural spelling of every supported magnitude word.
    magnitude_words = ("million", "milliard", "billion", "billiard", "trillion", "trilliard")
    suffix = pynini.union(
        *(word + ending for word in magnitude_words for ending in ("", "s"))
    )
    quantity_field = pynutil.insert(' quantity: "') + suffix + pynutil.insert('"')

    # Can be written either as 'deux-millions' or 'deux millions' depending on
    # whether it registers as a noun or part of cardinal.
    separator = pynini.union(delete_hyphen, delete_extra_space)
    cardinal_quantity = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + separator
        + quantity_field
    )
    decimal_quantity = decimal + delete_extra_space + quantity_field
    return cardinal_quantity | decimal_quantity
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
    Decimal point is "," (virgule).
        e.g. moins un virgule deux six -> decimal { negative: "true" integer_part: "1" fractional_part: "26" }

    This decimal rule assumes that decimals can be pronounced as:
    (an optional cardinal) + ('virgule') + (any sequence of cardinals <1 million, including 'zero')

    Also tags numbers followed by a large-number word, with the word kept as written:
        e.g. un virgule deux six million -> decimal { integer_part: "1" fractional_part: "26" quantity: "million" }
        e.g. deux-millions -> decimal { integer_part: "2" quantity: "millions" }
        e.g. moins cent-vingt-quatre-millions -> decimal { negative: "true" integer_part: "124" quantity: "millions" }
    The negative field is only emitted when the input starts with "moins".

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # number after decimal point can be any series of cardinals <1 million, including 'zero'
        one_number = cardinal.numbers_up_to_million
        graph_decimal = pynini.closure(one_number + delete_space) + one_number
        self.graph = graph_decimal

        # Possible negatives
        sign = (
            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space
        ).ques

        # Fractional portion
        fractional_field = (
            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
        )

        # Integers
        integers = cardinal.graph_no_exception | pynini.string_file(
            get_abs_path("data/numbers/zero.tsv")
        )
        integer_field = pynutil.insert('integer_part: "') + integers + pynutil.insert('"')

        # [integer] "virgule" fraction -- the decimal point is denoted by virgule
        unsigned_decimal = (
            pynini.closure(integer_field + delete_extra_space, 0, 1)
            + pynutil.delete("virgule")
            + delete_extra_space
            + fractional_field
        )
        # Cardinal or decimal followed by a quantity word; built once, used twice.
        unsigned_quantity = get_quantity(
            unsigned_decimal, cardinal.graph_hundreds_component_at_least_one_none_zero_digit
        )
        self.final_graph_wo_negative = unsigned_decimal | unsigned_quantity

        signed = (sign + unsigned_decimal) | (sign + unsigned_quantity)
        self.fst = self.add_tokens(signed).optimize()
import pynini
from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
DAMO_ALPHA,
GraphFst,
insert_space,
)
from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying 'electronic' semiotic classes, i.e.
    email address (which get converted to "username" and "domain" fields),
    and URLS (which get converted to a "protocol" field).
        e.g. c d f une arobase a b c point e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
        e.g. double vé double vé double vé a b c point e d u -> tokens { electronic { protocol: "www.abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")
        # Local helper: deletes exactly one space character (unlike the
        # graph_utils helper of the same name, which collapses whitespace).
        delete_extra_space = pynutil.delete(" ")
        # One spelled-out character: an ASCII letter, or a digit/zero word
        # mapped through the number data files.
        alpha_num = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )
        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
        # Spoken forms of the '@' sign (the variable name says "ampersand",
        # but these are at-sign words).
        ampersand = pynini.string_map([("arobase"), ("chez"), ("at"), ("à")])
        accepted_username = alpha_num | symbols
        process_dot = pynini.cross("point", ".")
        # Username: starts and ends with an alphanumeric, with any number of
        # alphanumerics/symbols in between; the separating spaces are deleted.
        username = (
            pynutil.insert('username: "')
            + alpha_num
            + delete_extra_space
            + pynini.closure(accepted_username + delete_extra_space)
            + alpha_num
            + pynutil.insert('"')
        )
        # One or more space-separated alphanumerics, joined without spaces.
        single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
        server = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")
        )
        domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        # domain: "<server>.<domain>"
        domain_graph = (
            pynutil.insert('domain: "')
            + server
            + delete_extra_space
            + process_dot
            + delete_extra_space
            + domain
            + pynutil.insert('"')
        )
        # Email: username, the at-sign word (deleted), then the domain field.
        graph = (
            username
            + delete_extra_space
            + pynutil.delete(ampersand)
            + insert_space
            + delete_extra_space
            + domain_graph
        )
        ############# url ###
        protocol_end = pynini.cross(
            pynini.union("www", "w w w", "double vé double vé double vé"), "www"
        )
        protocol_start = pynini.cross(pynini.union("http", "h t t p", "ache té té pé"), "http")
        protocol_start |= pynini.cross(
            pynini.union("https", "h t t p s", "ache té té pé esse"), "https"
        )
        # Spoken "colon slash slash" variants (with surrounding spaces) -> "://"
        protocol_start += pynini.cross(
            pynini.union(
                " deux-points barre oblique barre oblique ",
                " deux-points barre barre ",
                " deux-points double barre ",
                " deux-points slash slash ",
            ),
            "://",
        )
        # e.g. .com, .es
        ending = (
            delete_extra_space
            + symbols
            + delete_extra_space
            + (domain | pynini.closure(accepted_username + delete_extra_space) + accepted_username)
        )
        # URL: optional "http(s)://", then "www", a dot, a host part and at
        # least one ending.
        # NOTE(review): the first host alternative deletes a space both after the
        # dot and again inside the closure -- confirm the intended spacing.
        protocol = (
            pynini.closure(protocol_start, 0, 1)
            + protocol_end
            + delete_extra_space
            + process_dot
            + delete_extra_space
            + (pynini.closure(delete_extra_space + accepted_username, 1) | server)
            + pynini.closure(ending, 1)
        )
        protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
        graph |= protocol
        ########
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment