Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Paraformer_FunASR_pytorch
Commits
70a8a9e0
Commit
70a8a9e0
authored
Oct 03, 2024
by
wangwei990215
Browse files
initial commit
parents
Pipeline
#1738
failed with stages
in 0 seconds
Changes
827
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1137 additions
and
0 deletions
+1137
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/fraction.py
...cessing/inverse_text_normalization/fr/taggers/fraction.py
+67
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/measure.py
...ocessing/inverse_text_normalization/fr/taggers/measure.py
+85
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/money.py
...processing/inverse_text_normalization/fr/taggers/money.py
+130
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/ordinal.py
...ocessing/inverse_text_normalization/fr/taggers/ordinal.py
+67
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/punctuation.py
...sing/inverse_text_normalization/fr/taggers/punctuation.py
+22
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/telephone.py
...essing/inverse_text_normalization/fr/taggers/telephone.py
+72
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/time.py
..._processing/inverse_text_normalization/fr/taggers/time.py
+109
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py
...se_text_normalization/fr/taggers/tokenize_and_classify.py
+108
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/whitelist.py
...essing/inverse_text_normalization/fr/taggers/whitelist.py
+19
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/word.py
..._processing/inverse_text_normalization/fr/taggers/word.py
+15
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/utils.py
...un_text_processing/inverse_text_normalization/fr/utils.py
+13
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/__init__.py
...ing/inverse_text_normalization/fr/verbalizers/__init__.py
+1
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py
...ing/inverse_text_normalization/fr/verbalizers/cardinal.py
+38
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/date.py
...cessing/inverse_text_normalization/fr/verbalizers/date.py
+62
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
...sing/inverse_text_normalization/fr/verbalizers/decimal.py
+81
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py
...g/inverse_text_normalization/fr/verbalizers/electronic.py
+35
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py
...ing/inverse_text_normalization/fr/verbalizers/fraction.py
+43
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/measure.py
...sing/inverse_text_normalization/fr/verbalizers/measure.py
+68
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/money.py
...essing/inverse_text_normalization/fr/verbalizers/money.py
+31
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
...sing/inverse_text_normalization/fr/verbalizers/ordinal.py
+71
-0
No files found.
Too many changes to show.
To preserve performance only
827 of 827+
files are displayed.
Plain diff
Email patch
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/fraction.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_CHAR
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fractions.
    e.g. demi -> tokens { fraction { numerator: "1" denominator: "2" } }
    e.g. un et demi -> tokens { fraction { integer_part: "1" numerator: "1" denominator: "2" } }
    e.g. trois et deux centième -> tokens { fraction { integer_part: "3" numerator: "2" denominator: "100" } }

    Args:
        cardinal: CardinalFst (provides `graph_no_exception` for number recognition)
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="fraction", kind="classify")
        # Fields produced: integer_part, numerator, denominator.
        graph_cardinal = cardinal.graph_no_exception
        # Irregular ordinal roots (e.g. roots that change before "-ième") are
        # mapped back to their cardinal form via a TSV lookup.
        graph_strip_undo_root_change = pynini.string_file(
            get_abs_path("data/fractions.tsv")
        )  # add in absolute path
        graph_strip_no_root_change = pynutil.delete("ième")  # For no change to root
        graph_strip_no_root_change += pynutil.delete("s").ques  # for plurals
        graph_strip = graph_strip_no_root_change | graph_strip_undo_root_change
        # A denominator word = any characters followed by a stripped ordinal
        # morpheme, then re-read as a cardinal.
        self.fractional = ((pynini.closure(DAMO_CHAR) + graph_strip) @ graph_cardinal).optimize()

        integer = pynutil.insert('integer_part: "') + graph_cardinal + pynutil.insert('" ')
        integer += delete_space
        integer += pynutil.delete("et")  # used to demarcate integer and fractional parts

        numerator = pynutil.insert('numerator: "') + graph_cardinal + pynutil.insert('"')
        denominator = pynutil.insert(' denominator: "') + self.fractional + pynutil.insert('"')

        # Demi (half) can occur alone without explicit numerator; optional
        # feminine "e" and plural "s" endings are deleted.
        graph_demi_component = (
            pynutil.delete("demi") + pynutil.delete("e").ques + pynutil.delete("s").ques
        )
        graph_demi_component += pynutil.insert('numerator: "1" denominator: "2"')

        graph_fraction_component = numerator + delete_space + denominator
        graph_fraction_component |= graph_demi_component
        self.graph_fraction_component = graph_fraction_component

        # Integer part is optional ("demi" alone is valid).
        graph = pynini.closure(integer + delete_space, 0, 1) + graph_fraction_component
        graph = graph.optimize()
        self.final_graph_wo_negative = graph

        # "moins" marks a negative fraction.
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space,
            0,
            1,
        )
        graph = optional_graph_negative + graph
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/measure.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
GraphFst
,
delete_extra_space
,
delete_space
,
get_singulars
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measures. Allows for plural form of the unit.
    e.g. moins onze kilogramme -> measure { negative: "true" cardinal { integer: "11" } units: "kg" }
    e.g. trois heures -> measure { cardinal { integer: "3" } units: "h" }
    e.g. demi gramme -> measure { fraction { numerator: "1" denominator: "2" } units: "g" }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
        fraction: FractionFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst):
        super().__init__(name="measure", kind="classify")
        cardinal_graph = cardinal.graph_no_exception
        # Magnitude prefixes (e.g. kilo-, centi-) and base unit names -> symbols.
        graph_prefix = pynini.string_file(get_abs_path("data/measurements/magnitudes.tsv"))
        graph_unit_singular = pynini.string_file(get_abs_path("data/measurements/measurements.tsv"))
        # Accept plural unit spellings as well as singular ones.
        unit = get_singulars(graph_unit_singular) | graph_unit_singular
        unit = graph_prefix.ques + unit
        # "moins" marks a negative measure.
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space,
            0,
            1,
        )
        # "par"/"à" between units becomes a slash (e.g. km par heure -> km/h).
        unit_misc = (
            pynutil.insert("/") + (pynutil.delete("par") | pynutil.delete("à")) + delete_space + unit
        )
        unit = (
            pynutil.insert('units: "')
            + (unit | unit_misc | pynutil.add_weight(unit + delete_space + unit_misc, 0.01))
            + pynutil.insert('"')
        )
        # Number + unit combinations for the three numeric taggers.
        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_graph_negative
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + unit
        )
        subgraph_fraction = (
            pynutil.insert("fraction { ")
            + optional_graph_negative
            + fraction.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + unit
        )
        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert('integer: "')
            + cardinal_graph
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
            + unit
        )
        final_graph = subgraph_decimal | subgraph_cardinal | subgraph_fraction
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/money.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_DIGIT
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money.
    e.g. douze euro cinq -> money { integer_part: "12" currency: "€" fractional_part: 05}
    e.g. zéro euro cinq -> money { integer_part: "0" currency: "€" fractional_part: 05}
    e.g. cinq centimes -> money { integer_part: "0" currency: "€" fractional_part: 05}
    Note, the currency symbol seems more common for exact amounts and quantities less than 'un million'.
    For 'round' quantities of >= million (milliard, billion), the symbol is dropped. This allows
    use of the 'de' preposition.
    e.g. cinq millions d'euros -> money { integer_part: "5" currency: "d'euros" fractional_part: 00}
    e.g. un milliard d'euro -> money { integer_part: "1" currency: "d'euro" fractional_part: 00}
    e.g. trois virgule trois millions d'euros -> money { integer_part: "3" currency: "d'euros" fractional_part: 3}
    Currency is included for uniform tagging.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")
        # Fields produced: quantity, integer_part, fractional_part, currency.
        cardinal_graph = cardinal.graph_no_exception
        graph_decimal = decimal.final_graph_wo_negative
        # Converts currency names to symbols (TSV lookups).
        convert_currency_major = pynini.string_file(
            get_abs_path("data/money/currency_major.tsv")
        )  # major denominations
        convert_currency_minor = pynini.string_file(
            get_abs_path("data/money/currency_minor.tsv")
        )  # minor denominations to major symbol. (e.g. 5 cents -> 0.05 $ )
        accept_all_currency = (convert_currency_major | convert_currency_minor).project(
            "input"
        )  # recognizes all currencies

        # Graphs for large round amounts ('deux billiards d'euros', 'un milliard de dollars').
        graph_de = pynini.union("de ", "des ", "d'")  # the use of de/d' only occurs with round amounts
        graph_currency_component_large_round_amounts = graph_de + accept_all_currency
        graph_currency_component_large_round_amounts = (
            pynutil.insert(' currency: "')
            + graph_currency_component_large_round_amounts
            + pynutil.insert('"')
        )
        graph_money_large_round_amounts = (
            graph_decimal + delete_space
        )  # graph_decimal includes tags and quantities already
        graph_money_large_round_amounts += graph_currency_component_large_round_amounts

        # For standard currency: pad single-digit cents to two digits.
        add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
            pynutil.insert("0") + DAMO_DIGIT
        )
        # Graphs integer denomination for major denominations (e.g. $).
        graph_integer_component_major = (
            pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
        )
        graph_integer_component_major += delete_space
        graph_currency_component_major = (
            pynutil.insert(' currency: "') + convert_currency_major + pynutil.insert('"')
        )
        graph_decimal_component_major = (
            delete_space
            + pynutil.insert(' fractional_part: "')
            + (cardinal_graph @ add_leading_zero_to_double_digit)
            + pynutil.insert('"')
        )

        # Rare cases where 'et' will separate major and minor denominations.
        delete_minor_currency = pynini.project(convert_currency_minor, "input")
        delete_minor_currency = delete_extra_space + pynutil.delete(delete_minor_currency)
        delete_et = delete_extra_space + pynutil.delete("et")

        graph_money_major = (
            graph_integer_component_major
            + graph_currency_component_major
            + delete_et.ques
            + graph_decimal_component_major.ques
            + delete_minor_currency.ques
        )

        # For cases when only small denominations are used: integer_part is fixed at "0".
        graph_integer_component_minor = pynutil.insert('integer_part: "0"')
        graph_decimal_component_minor = (
            pynutil.insert(' fractional_part: "')
            + (cardinal_graph @ add_leading_zero_to_double_digit)
            + pynutil.insert('"')
        )
        graph_decimal_component_minor += delete_extra_space
        graph_currency_component_minor = (
            pynutil.insert(' currency: "') + convert_currency_minor + pynutil.insert('"')
        )
        graph_money_minor = (
            graph_integer_component_minor
            + graph_decimal_component_minor
            + graph_currency_component_minor
        )

        graph_money_standard_amounts = graph_money_major | graph_money_minor
        final_graph = graph_money_large_round_amounts | graph_money_standard_amounts
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/ordinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_SIGMA
,
GraphFst
,
delete_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinals.
    vingt-deuxième -> ordinal { integer: "22" morphosyntactic_features: "e" }
    Also notes specific nouns that have unique normalization conventions.
    For instance, 'siècles' are rendered in roman numerals when given an ordinal adjective.
    e.g. dix-neuvième siècle -> XIXe

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")
        graph_cardinal = cardinal.graph_no_exception
        graph_undo_root_change = pynini.string_file(
            get_abs_path("data/ordinals/digits_root_change.tsv")
        )  # Returns base number to normal after root change.
        graph_firsts = pynini.string_file(get_abs_path("data/ordinals/firsts.tsv"))
        graph_second = pynini.string_file(get_abs_path("data/ordinals/second.tsv"))
        # Nouns that trigger special rendering (e.g. roman numerals) downstream.
        graph_special_ordinals = pynini.string_file(get_abs_path("data/ordinals/key_nouns.tsv"))

        # Removes the ordinal morpheme.
        graph_no_root_change = pynutil.delete("ième")  # For no change to root
        graph_strip_morpheme = pynini.union(graph_no_root_change, graph_undo_root_change)
        graph_strip_morpheme = DAMO_SIGMA + graph_strip_morpheme

        # Stripped word re-read as a cardinal number.
        graph_integer_component = graph_strip_morpheme @ graph_cardinal

        graph_morpheme_component = pynutil.insert("e")  # Put the superscript in.
        graph_morpheme_component += pynini.accep("s").ques  # In case of plurals.

        # Concatenate with cardinal graph.
        graph_ordinal = pynutil.insert('integer: "') + graph_integer_component + pynutil.insert('"')
        graph_ordinal += (
            pynutil.insert(' morphosyntactic_features: "') + graph_morpheme_component
        )  # Leave open in case further morphemes occur

        # Premier has a different superscript depending on gender, need to take note if
        # 'premier' or 'première'.
        graph_firsts = pynutil.insert('integer: "1" morphosyntactic_features: "') + graph_firsts
        # Second uses 'd' as a superscript.
        graph_second = pynutil.insert('integer: "2" morphosyntactic_features: "') + graph_second

        graph = graph_firsts | graph_second | graph_ordinal

        # For roman numerals. Carries over designation to verbalizer.
        graph_special_ordinals = pynutil.insert("/") + delete_space + graph_special_ordinals
        # Closing quote of morphosyntactic_features is appended here.
        graph += graph_special_ordinals.ques + pynutil.insert('"')

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/punctuation.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
GraphFst
from
pynini.lib
import
pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation.
    e.g. a, -> tokens { name: "a" } tokens { name: "," }
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")
        # ASCII punctuation, plus French guillemets.
        ascii_marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        guillemets = "\u00AB" + "\u00BB"  # quotation marks in French
        all_marks = ascii_marks + guillemets
        # Accept any single punctuation character.
        single_mark = pynini.union(*all_marks)
        tagged = pynutil.insert('name: "') + single_mark + pynutil.insert('"')
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/telephone.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
GraphFst
,
delete_hyphen
,
delete_space
,
insert_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers. Assumes conventional grouping
    for Metropolitan France (and overseas departments)
    (two number sequences are grouped as individual cardinals) or digit by digit (chiffre-par-chiffre) e.g.
    "zero un quatre-vingt-deux zero deux vingt-deux cinquante" -> { number_part: "01 42 02 22 50" }
    "zero un quatre deux zero deux deux deux cinq zero" -> { number_part: "01 42 02 22 50" }
    In cases where only one digit of the first pairing is admitted, assumes that the 0 was skipped.
    "une vingt-trois quatre-vingt zero six dix-sept" -> { number_part: "01 23 40 06 17" }
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")
        # create `single_digits` and `double_digits` graphs as these will be
        # the building blocks of possible telephone numbers
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties_unique = pynini.string_file((get_abs_path("data/numbers/ties_unique.tsv")))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        # A spoken two-digit group: teens, irregular ties, a bare tie ("vingt" -> "20"),
        # or a hyphenated tie + digit ("vingt-deux" -> "22").
        double_digits = pynini.union(
            graph_teen,
            graph_ties_unique,
            (graph_ties + pynutil.insert("0")),
            (graph_ties + delete_hyphen + graph_digit),
        )

        # Leading pair: "zero X", or a lone digit with the 0 assumed omitted.
        graph_first_pair = graph_zero + delete_space + graph_digit
        graph_first_pair |= pynutil.insert("0") + graph_digit  # if zero is omitted
        graph_first_pair += (
            delete_space + insert_space
        )  # delete_space since closure allows possible gaps to be removed

        # All digits spoken individually.
        single_digits = graph_digit | graph_zero
        graph_pair_all_digits = single_digits + delete_space
        graph_pair_all_digits += single_digits

        # Exactly three middle pairs + one final pair = 10 digits total.
        graph_all_digits = pynini.closure(graph_pair_all_digits + delete_space + insert_space, 3, 3)
        graph_all_digits = graph_first_pair + graph_all_digits + graph_pair_all_digits

        # Pairs spoken as two-digit cardinals (or digit-by-digit).
        graph_pair_digits_and_ties = double_digits | graph_pair_all_digits
        graph_digits_and_ties = pynini.closure(
            graph_pair_digits_and_ties + delete_space + insert_space, 3, 3
        )
        graph_digits_and_ties = (
            graph_first_pair + graph_digits_and_ties + graph_pair_digits_and_ties
        )

        number_part = pynini.union(graph_all_digits, graph_digits_and_ties)
        number_part = pynutil.insert('number_part: "') + number_part + pynutil.insert('"')
        graph = number_part
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/time.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
GraphFst
,
delete_space
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time.
    e.g. huit heures -> time { hours: "8" minutes: "00" }
    e.g. treize heures -> time { hours: "13" minutes: "00" }
    e.g. treize heures dix -> time { hours: "13" minutes: "10" }
    e.g. huit heures du matin -> time { hours: "8" minutes: "00" suffix: "am"}
    e.g. huit heures de l'après-midi -> time { hours: "8" minutes: "00" suffix: "pm"}
    e.g. douze heures moins quart -> time { hours: "11" minutes: "45" }
    e.g. douze heures et quart -> time { hours: "12" minutes: "15" }
    e.g. midi et quart -> time { hours: "12" minutes: "15" }
    e.g. minuit et demie -> time { hours: "0" minutes: "30" }
    e.g. douze heures moins demie -> time { hours: "11" minutes: "30" }
    e.g. douze heures moins trois -> time { hours: "11" minutes: "57" }
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")
        # Fields: hours, minutes, seconds, suffix, zone, style, speak_period.
        # time_zone = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
        # *_to tables handle subtraction for "moins" phrasing (hour - 1, 60 - minutes).
        graph_hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
        graph_minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
        graph_hours = pynini.string_file(get_abs_path("data/time/hours.tsv"))
        graph_minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
        graph_suffix_am = pynini.string_file(get_abs_path("data/time/time_suffix_am.tsv"))
        graph_suffix_pm = pynini.string_file(get_abs_path("data/time/time_suffix_pm.tsv"))

        graph_suffix = pynini.cross(graph_suffix_am, "am") | pynini.cross(graph_suffix_pm, "pm")

        # Mapping 'heures': delete "heure" with optional plural "s".
        graph_heures = pynini.accep("heure") + pynini.accep("s").ques
        graph_heures = pynutil.delete(graph_heures)

        graph_hours += delete_space + graph_heures

        # Midi and minuit.
        graph_midi = pynini.cross("midi", "12")
        graph_minuit = pynini.cross("minuit", "0")

        # Mapping 'et demi' and 'et quart'.
        graph_et = pynutil.delete("et") + delete_space

        graph_demi = pynini.accep("demi")
        graph_demi += pynini.accep("e").ques  # people vary on feminine or masculine form
        graph_demi = pynini.cross(graph_demi, "30")

        graph_quart = pynini.accep("quart")
        graph_quart = pynini.accep("le ").ques + graph_quart  # sometimes used
        graph_quart = pynini.cross(graph_quart, "15")
        graph_trois_quart = pynini.cross("trois quarts", "45")

        graph_fractions = pynini.union(graph_demi, graph_quart, graph_trois_quart)
        graph_et_fractions = graph_et + graph_fractions

        # Hours component is usually just a cardinal + 'heures' (ignored in case of 'midi/minuit').
        graph_hours_component = pynini.union(graph_hours, graph_midi, graph_minuit)
        graph_hours_component = (
            pynutil.insert('hours: "') + graph_hours_component + pynutil.insert('"')
        )
        graph_hours_component += delete_space

        # Minutes component: a plain minutes value or an 'et <fraction>' phrase.
        graph_minutes_component = (
            pynutil.insert(' minutes: "')
            + pynini.union(graph_minutes, graph_et_fractions)
            + pynutil.insert('"')
        )

        # Hour and minutes together. For 'demi' and 'quart', 'et' is used as a conjunction.
        graph_time_standard = graph_hours_component + graph_minutes_component.ques

        # For time until hour. "quatre heures moins quart" -> 4 h 00 - 0 h 15 = 3 h 45
        graph_moins = pynutil.delete("moins")
        graph_moins += delete_space

        graph_hours_to_component = graph_hours | graph_midi | graph_minuit
        graph_hours_to_component @= graph_hours_to  # subtract one hour
        graph_hours_to_component = (
            pynutil.insert('hours: "') + graph_hours_to_component + pynutil.insert('"')
        )
        graph_hours_to_component += delete_space

        graph_minutes_to_component = pynini.union(graph_minutes, graph_fractions)
        graph_minutes_to_component @= graph_minutes_to  # 60 - minutes
        graph_minutes_to_component = (
            pynutil.insert(' minutes: "') + graph_minutes_to_component + pynutil.insert('"')
        )

        graph_time_to = graph_hours_to_component + graph_moins + graph_minutes_to_component

        graph_time_no_suffix = graph_time_standard | graph_time_to

        # Optional am/pm suffix.
        graph_suffix_component = pynutil.insert(' suffix: "') + graph_suffix + pynutil.insert('"')
        graph_suffix_component = delete_space + graph_suffix_component
        graph_suffix_component = graph_suffix_component.ques

        final_graph = graph_time_no_suffix + graph_suffix_component
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py
0 → 100644
View file @
70a8a9e0
import
os
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
GraphFst
,
delete_extra_space
,
delete_space
,
generator_main
,
)
from
fun_text_processing.inverse_text_normalization.fr.taggers.cardinal
import
CardinalFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.date
import
DateFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.decimal
import
DecimalFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.electronic
import
ElectronicFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.fraction
import
FractionFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.measure
import
MeasureFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.money
import
MoneyFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.ordinal
import
OrdinalFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.punctuation
import
PunctuationFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.telephone
import
TelephoneFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.time
import
TimeFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.whitelist
import
WhiteListFst
from
fun_text_processing.inverse_text_normalization.fr.taggers.word
import
WordFst
from
pynini.lib
import
pynutil
import
logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process
    an entire sentence that is lower cased.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State
    Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        # cache_dir may arrive as the literal string "None" from CLI configs.
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_fr_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Restore the compiled grammar from cache instead of rebuilding.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            # Lazy %-style args: formatting happens only if the record is emitted.
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info("Creating ClassifyFst grammars.")
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            fraction = FractionFst(cardinal)
            fraction_graph = fraction.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst
            date_graph = DateFst(cardinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal, decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst().fst

            # Lower weight = higher priority; word is the catch-all fallback (weight 100).
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.05)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.08)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.09)
                | pynutil.add_weight(money_graph, 1.07)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )
            punct = (
                pynutil.insert("tokens { ")
                + pynutil.add_weight(punct_graph, weight=1.1)
                + pynutil.insert(" }")
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            # A token may be surrounded by punctuation tokens.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + token
                + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/whitelist.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
GraphFst
,
convert_space
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens.
    e.g. misses -> tokens { name: "mrs." }
    This class has highest priority among all classifier grammars. Whitelisted tokens
    are defined and loaded from "data/whitelist.tsv".
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")
        # Verbatim replacement pairs loaded from the whitelist TSV.
        replacements = pynini.string_file(get_abs_path("data/whitelist.tsv"))
        tagged = (
            pynutil.insert('name: "')
            + convert_space(replacements)
            + pynutil.insert('"')
        )
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/word.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
DAMO_NOT_SPACE
,
GraphFst
from
pynini.lib
import
pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens that do not belong to any
    special class. This can be considered as the default class.
    e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")
        # One or more non-space characters make up a plain token.
        token_chars = pynini.closure(DAMO_NOT_SPACE, 1)
        tagged = pynutil.insert('name: "') + token_chars + pynutil.insert('"')
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/utils.py
0 → 100644
View file @
70a8a9e0
import
os
def get_abs_path(rel_path):
    """
    Get the absolute path of a file shipped alongside this module.

    Args:
        rel_path: path relative to the directory containing this file

    Returns:
        Absolute path string.
    """
    # os.path.join handles separators portably; the original concatenated
    # with a literal "/" which is not correct on all platforms.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinals.
    e.g. cardinal { negative: "-" integer: "23" } -> -23
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")
        drop_quote = pynutil.delete('"')

        # Optional sign: strip the `negative:` tag and quotes, keep the sign char.
        sign_field = (
            pynutil.delete("negative:")
            + delete_space
            + drop_quote
            + DAMO_NOT_QUOTE
            + drop_quote
            + delete_space
        )
        optional_sign = pynini.closure(sign_field, 0, 1)

        # Integer: strip the `integer:` tag and quotes, keep the digits.
        digits = pynini.closure(DAMO_NOT_QUOTE, 1)
        integer_field = (
            pynutil.delete("integer:") + delete_space + drop_quote + digits + drop_quote
        )

        # Exposed for reuse by other verbalizers (unsigned number only).
        self.numbers = integer_field

        delete_tokens = self.delete_tokens(optional_sign + integer_field)
        self.fst = delete_tokens.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/date.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing dates, e.g.
    date { day: "1" month: "janvier" preserve_order: true } -> 1ᵉʳ janvier
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")
        # The first of the month is written as an ordinal (1ᵉʳ).
        convert_primer = pynini.cross("1", "1ᵉʳ")
        day = (
            pynutil.delete("day:")
            + delete_space
            + pynutil.delete('"')
            + (pynini.closure(DAMO_NOT_QUOTE, 1) | pynutil.add_weight(convert_primer, -1))
            # first of the month is ordinal (negative weight makes it preferred for "1")
            + pynutil.delete('"')
        )
        month = (
            pynutil.delete("month:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        year = (
            pynutil.delete("year:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # day month [year]
        graph_dm = day + delete_extra_space + month
        graph_dmy = graph_dm + delete_extra_space + year
        # Consume ordering metadata emitted by the tagger (preserve_order / field_order).
        optional_preserve_order = pynini.closure(
            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
            | pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )
        final_graph = (graph_dm | graph_dmy) + delete_space + optional_preserve_order
        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_DIGIT
,
DAMO_NON_BREAKING_SPACE
,
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class NumberParser(GraphFst):
    """
    Finite state transducer for parsing strings of digits. Breaks up digit strings into
    groups of three for strings of four or more digits (inclusive). Groupings are
    separated by a non-breaking space.
    e.g. '1000' -> '1 000'
    e.g. '1000,33333' -> '1 000,333 33'

    NOTE(review): this class currently only calls the base-class constructor; the
    grouping logic described above is implemented inside DecimalFst (`group_by_threes`).
    """

    def __init__(self):
        super().__init__(name="parser", kind="verbalize")
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimals, e.g.
    decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" } -> -12.5006 billion
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")
        # Need parser to group digits by threes (French convention uses a
        # non-breaking space as thousands separator).
        exactly_three_digits = DAMO_DIGIT**3
        at_most_three_digits = pynini.closure(DAMO_DIGIT, 1, 3)

        # Integer part: groups counted from the right (1234 -> 1 234).
        space_every_three_integer = at_most_three_digits + (
            pynutil.insert(DAMO_NON_BREAKING_SPACE) + exactly_three_digits
        ).closure()
        # Fractional part: groups counted from the left after the comma.
        space_every_three_decimal = (
            pynini.accep(",")
            + (exactly_three_digits + pynutil.insert(DAMO_NON_BREAKING_SPACE)).closure()
            + at_most_three_digits
        )
        group_by_threes = space_every_three_integer | space_every_three_decimal
        self.group_by_threes = group_by_threes

        # negative: "true" -> leading "-"
        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
        integer = (
            pynutil.delete("integer_part:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        integer = integer @ group_by_threes
        optional_integer = pynini.closure(integer + delete_space, 0, 1)

        # Fractional part is prefixed with a decimal comma.
        fractional = (
            pynutil.insert(",")
            + pynutil.delete("fractional_part:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        fractional = fractional @ group_by_threes
        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)

        # Quantity word (e.g. "billion") is kept verbatim, preceded by a space.
        quantity = (
            pynutil.delete("quantity:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)

        graph = (optional_integer + optional_fractional + optional_quantity).optimize()
        self.numbers = graph  # unsigned form, reused by other verbalizers
        graph = optional_sign + graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic addresses.
    e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")

        def unquoted_value(label):
            # Drop the field label and its surrounding quotes, keep the payload.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        # username + "@" + domain, with inter-field whitespace removed.
        address = (
            unquoted_value("username:")
            + delete_space
            + pynutil.insert("@")
            + unquoted_value("domain:")
        )
        self.fst = self.delete_tokens(address).optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
insert_space
,
)
from
pynini.lib
import
pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fractions.
    e.g. fraction { integer_part: "1" numerator: "2" denominator: "3" } -> 1 2/3

    Exposes self.numbers (the unsigned fraction graph) for reuse by MeasureFst.
    """

    def __init__(self):
        super().__init__(name="fraction", kind="verbalize")

        def field(label):
            # Delete `label: "` up to the payload, then the closing quote.
            return (
                pynutil.delete(label)
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        # Optional leading minus from `negative: "true"`.
        maybe_minus = pynini.closure(
            pynini.cross('negative: "true"', "-") + delete_space, 0, 1
        )
        # Optional whole part, followed by a space before the fraction proper.
        whole_part = field('integer_part: "') + insert_space
        numerator = field('numerator: "')
        denominator = pynutil.insert("/") + field('denominator: "')

        body = (
            pynini.closure(whole_part + delete_space, 0, 1)
            + numerator
            + delete_space
            + denominator
        ).optimize()
        # Unsigned graph exported before the sign is attached.
        self.numbers = body
        self.fst = self.delete_tokens(maybe_minus + body).optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/measure.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_CHAR
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
    measure { negative: "true" cardinal { integer: "12" } units: "kg" } -> -12 kg

    Args:
        decimal: DecimalFst
        cardinal: CardinalFst
        fraction: FractionFst
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst):
        super().__init__(name="measure", kind="verbalize")

        maybe_minus = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)

        def unwrap(label, amount):
            # Strip `label { ... }` around an amount graph, keeping an
            # optional leading minus sign.
            return (
                pynutil.delete(label)
                + delete_space
                + maybe_minus
                + delete_space
                + amount
                + delete_space
                + pynutil.delete("}")
            )

        # measurements most obey three by three spacing
        grouped_cardinal = cardinal.numbers @ decimal.group_by_threes

        amount = (
            unwrap("cardinal {", grouped_cardinal)
            | unwrap("decimal {", decimal.numbers)
            | unwrap("fraction {", fraction.numbers)
        )
        unit = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
            + delete_space
        )
        # Amount, a single separating space, then the unit token.
        graph = amount + delete_space + pynutil.insert(" ") + unit
        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/money.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
    money { integer_part: "12" fractional_part: "05" currency: "$" } -> 12.05 $

    Args:
        decimal: DecimalFst
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")
        # Currency symbol: drop the label and quotes, normalize the gap
        # before it to a single space.
        currency = (
            pynutil.delete("currency:")
            + delete_extra_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        # Amount (unsigned decimal graph) followed by the currency symbol.
        amount_with_currency = decimal.numbers + delete_space + currency
        self.fst = self.delete_tokens(amount_with_currency).optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_DIGIT
,
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
    ordinal { integer: "13" morphosyntactic_features: "e" } -> 13ᵉ
    Given 'special' terms for ordinals (e.g. siècle), renders
    amount in conventional format. e.g.
    ordinal { integer: "13" morphosyntactic_features: "e/siècle" } -> XIIIᵉ
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")
        # Extract the digit payload of `integer: "..."`.
        graph_integer = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        # Map ordinal suffix letters to their superscript forms.
        replace_suffix = pynini.union(
            pynini.cross("e", "ᵉ"),
            # only delete first quote since there may be more features
            pynini.cross("d", "ᵈ"),
            pynini.cross("r", "ʳ"),
            pynini.cross("s", "ˢ"),
        )
        # Note: the closing quote of morphosyntactic_features is deleted later,
        # at the end of `graph` below.
        replace_suffix = pynutil.delete(' morphosyntactic_features: "') + replace_suffix.plus
        # NOTE(review): `replace_suffix` already ends in `.plus`, so the extra
        # `.plus` here allows the whole label+suffix sequence to repeat —
        # presumably only one such field ever occurs; confirm this is intended.
        graph_arabic = graph_integer + replace_suffix.plus

        # For roman.
        # TSV maps are roman->arabic; invert to render arabic digits as roman.
        graph_roman_digits = pynini.string_file(
            get_abs_path("data/roman/digits_large.tsv")
        ).invert()
        graph_roman_ties = pynini.string_file(
            get_abs_path("data/roman/ties_large.tsv")
        ).invert()
        graph_roman_hundreds = pynini.string_file(
            get_abs_path("data/roman/hundreds_large.tsv")
        ).invert()
        # Zeros produce no roman output.
        graph_roman_zero_digit = pynutil.delete("0")
        # Exactly three digits: hundreds + optional tens + optional units.
        graph_roman_hundreds = DAMO_DIGIT ** 3 @ (
            graph_roman_hundreds
            + pynini.union(graph_roman_ties, graph_roman_zero_digit)
            + pynini.union(graph_roman_digits, graph_roman_zero_digit)
        )
        # Exactly two digits: tens + optional units.
        graph_roman_ties = DAMO_DIGIT ** 2 @ (
            graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit)
        )
        # Single digit.
        graph_roman_digits = DAMO_DIGIT @ graph_roman_digits
        graph_roman_integers = graph_roman_hundreds | graph_roman_ties | graph_roman_digits
        # Roman path: convert the integer, then the superscript suffix.
        graph_roman = (graph_integer @ graph_roman_integers) + replace_suffix
        # Roman form only applies when the features carry "/siècle";
        # the "/" becomes a space before the literal word.
        graph_roman += pynini.cross("/", " ") + "siècle"
        # Either path, then delete the features field's closing quote.
        graph = (graph_roman | graph_arabic) + pynutil.delete('"')
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Prev
1
…
20
21
22
23
24
25
26
27
28
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment