Commit 70a8a9e0 authored by wangwei990215
Browse files

initial commit

parents
Pipeline #1738 failed with stages
in 0 seconds
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_DIGIT, GraphFst, insert_space
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Inverse text normalization tagger for cardinal numbers, e.g.
        "тысяча один" -> cardinal { integer: "1 001" }

    The grammar is built by inverting the text normalization (TN) cardinal graph.
    The inverted graph is also exposed as ``self.graph``.

    Args:
        tn_cardinal: Text normalization Cardinal graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        numbers = tn_cardinal.cardinal_numbers_default
        # NOTE(review): ``invert()`` is called as a method on the TN graph object itself;
        # the same ``numbers`` reference is reused below, exactly as before.
        self.graph = numbers.invert().optimize()

        # Optional leading "минус " is tagged as: negative: "-"
        sign = pynutil.insert("negative: ") + pynini.cross("минус ", '"-"') + insert_space
        optional_sign = pynini.closure(sign, 0, 1)

        # do not invert numbers less than 10: only outputs made of two or more
        # DAMO_DIGIT symbols are kept
        multi_digit = pynini.compose(numbers, DAMO_DIGIT ** (2, ...))

        integer_field = pynutil.insert('integer: "') + multi_digit + pynutil.insert('"')
        tagged = self.add_tokens(optional_sign + integer_field)
        self.fst = tagged.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Inverse text normalization tagger for dates, e.g.
        восемнадцатое июня две тысячи второго -> tokens { date { day: "18.06.2002" } }

    The whole written date is stored in the single ``day`` field.

    Args:
        tn_date: Text normalization Date graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_date: GraphFst, deterministic: bool = True):
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # Inverted copy of the TN date graph.
        inverted_date = pynini.invert(tn_date.final_graph).optimize()

        day_field = pynutil.insert('day: "') + inverted_date + pynutil.insert('"')
        self.fst = self.add_tokens(day_field).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_SPACE,
GraphFst,
delete_extra_space,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
    """
    Inverse text normalization tagger for decimals, e.g.
        "минус три целых две десятых" ->
        decimal { negative: "true" integer_part: "3," fractional_part: "2" }

    The sign-less grammar is also exposed as ``self.final_graph_wo_sign``.

    Args:
        tn_decimal: Text normalization Decimal graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_decimal, deterministic: bool = False):
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        # Optional leading "минус" is tagged as: negative: "true"
        negative = (
            pynutil.insert("negative: ") + pynini.cross("минус", '"true"') + delete_extra_space
        )
        optional_negative = pynini.closure(negative, 0, 1)

        # Inverted copies of the individual TN decimal components.
        inv_fractional = pynini.invert(tn_decimal.graph_fractional).optimize()
        inv_integer = pynini.invert(tn_decimal.integer_part).optimize()
        inv_quantity = pynini.invert(tn_decimal.optional_quantity).optimize()

        fractional_field = (
            pynutil.insert('fractional_part: "') + inv_fractional + pynutil.insert('"')
        )
        integer_field = pynutil.insert('integer_part: "') + inv_integer + pynutil.insert('"')
        quantity_field = pynutil.insert('quantity: "') + inv_quantity + pynutil.insert('"')

        # The quantity, when present, follows the fractional part after a single space.
        optional_quantity = pynini.closure(pynini.accep(DAMO_SPACE) + quantity_field, 0, 1)

        self.final_graph_wo_sign = (
            integer_field + pynini.accep(DAMO_SPACE) + fractional_field + optional_quantity
        )

        tagged = self.add_tokens(optional_negative + self.final_graph_wo_sign)
        self.fst = tagged.optimize()
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Inverse text normalization tagger for electronic addresses, e.g.
        "эй би собака эн ди точка ру" -> electronic { username: "ab@nd.ru" }

    The whole written form is stored in the single ``username`` field.

    Args:
        tn_electronic: Text normalization Electronic graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_electronic, deterministic: bool = True):
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)

        # ``invert()`` is called as a method on the TN graph object itself.
        inverted = tn_electronic.final_graph.invert().optimize()

        username_field = pynutil.insert('username: "') + inverted + pynutil.insert('"')
        self.fst = self.add_tokens(username_field).optimize()
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Inverse text normalization tagger for measures, e.g.
        "два килограма" -> measure { cardinal { integer: "2 кг" } }

    The number and its unit are stored together in the nested ``integer`` field.

    Args:
        tn_measure: Text normalization Measure graph (its ``tagger_graph_default`` and
            ``verbalizer_graph`` attributes are used)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_measure, deterministic: bool = True):
        super().__init__(name="measure", kind="classify", deterministic=deterministic)

        # Compose the TN tagger with the TN verbalizer, then invert the result.
        tn_pipeline = tn_measure.tagger_graph_default @ tn_measure.verbalizer_graph
        inverted = tn_pipeline.invert().optimize()

        nested_field = pynutil.insert('cardinal { integer: "') + inverted + pynutil.insert('" }')
        self.fst = self.add_tokens(nested_field).optimize()
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Inverse text normalization tagger for money, e.g.
        "два рубля" -> money { integer_part: "2 руб." }

    The amount and its currency are stored together in the ``integer_part`` field.

    Args:
        tn_money: Text normalization Money graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_money, deterministic: bool = True):
        super().__init__(name="money", kind="classify", deterministic=deterministic)

        # ``invert()`` is called as a method on the TN graph object itself.
        inverted = tn_money.final_graph.invert().optimize()

        amount_field = pynutil.insert('integer_part: "') + inverted + pynutil.insert('"')
        self.fst = self.add_tokens(amount_field).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_DIGIT, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Inverse text normalization tagger for ordinals, e.g.
        "второе" -> ordinal { integer: "2" }

    The inverted TN graph is also exposed as ``self.graph``; ``self.fst`` only keeps
    outputs made of two or more DAMO_DIGIT symbols.

    Args:
        tn_ordinal: Text normalization Ordinal graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_ordinal: GraphFst, deterministic: bool = True):
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        # ``invert()`` is called as a method on the TN graph object itself.
        inverted = tn_ordinal.ordinal_numbers.invert().optimize()
        self.graph = inverted

        # do not invert numbers less than 10
        multi_digit = pynini.compose(inverted, DAMO_DIGIT ** (2, ...))

        integer_field = pynutil.insert('integer: "') + multi_digit + pynutil.insert('"')
        self.fst = self.add_tokens(integer_field).optimize()
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Inverse text normalization tagger for telephone numbers, e.g.
        "восемь девятьсот тринадцать девятьсот восемьдесят три пятьдесят шесть ноль один"
        -> telephone { number_part: "8-913-983-56-01" }

    Args:
        tn_telephone: Text normalization telephone graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_telephone: GraphFst, deterministic: bool = True):
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        # ``invert()`` is called as a method on the TN graph object itself.
        inverted = tn_telephone.final_graph.invert().optimize()

        number_field = pynutil.insert('number_part: "') + inverted + pynutil.insert('"')
        self.fst = self.add_tokens(number_field).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_SPACE, GraphFst
from fun_text_processing.text_normalization.ru.verbalizers.time import TimeFst as TNTimeVerbalizer
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Inverse text normalization tagger for time, e.g.
        "два часа пятнадцать минут" -> time { hours: "02:15" }

    Three alternatives are accepted:
        * the inverted TN tagger+verbalizer pipeline, stored in a single ``hours`` field;
        * "<minutes> <next hour, ordinal>", tagged as ``minutes`` followed by ``hours``;
        * "без <minutes> <hour, cardinal>", tagged as ``minutes`` followed by ``hours``.

    Args:
        tn_time: Text Normalization Time graph
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, tn_time: GraphFst, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        quote = pynutil.insert('"')
        hours_open = pynutil.insert('hours: "')
        space = pynini.accep(DAMO_SPACE)

        # Order-preserving case: TN tagger composed with the TN verbalizer, then inverted.
        tn_tagger = tn_time.graph_preserve_order
        tn_verbalizer = TNTimeVerbalizer().graph
        tn_pipeline = pynini.compose(tn_tagger, tn_verbalizer).optimize()
        inverted_pipeline = pynini.invert(tn_pipeline).optimize()
        preserve_order = hours_open + inverted_pipeline + quote

        # "пятнадцать минут шестого" -> 17:15
        # Requires permutations for the correct verbalization
        minutes_then_next_hour = (
            pynutil.insert('minutes: "')
            + pynini.invert(tn_time.minutes).optimize()
            + quote
            + space
            + hours_open
            + pynini.invert(tn_time.increment_hour_ordinal).optimize()
            + quote
        ).optimize()

        # "без пятнадцати минут шесть" -> 17:45
        # Requires permutation for the correct verbalization
        minutes_to_hour = (
            pynini.cross("без ", 'minutes: "')
            + pynini.invert(tn_time.mins_to_h)
            + quote
            + space
            + hours_open
            + pynini.invert(tn_time.increment_hour_cardinal).optimize()
            + quote
        )

        reordered = minutes_then_next_hour | minutes_to_hour
        tagged = self.add_tokens(preserve_order | reordered)
        self.fst = tagged.optimize()
import os
import pynini
from fun_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
from fun_text_processing.inverse_text_normalization.en.taggers.word import WordFst
from fun_text_processing.inverse_text_normalization.ru.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.ru.taggers.date import DateFst
from fun_text_processing.inverse_text_normalization.ru.taggers.decimals import DecimalFst
from fun_text_processing.inverse_text_normalization.ru.taggers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.ru.taggers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.ru.taggers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.ru.taggers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.ru.taggers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.ru.taggers.time import TimeFst
from fun_text_processing.inverse_text_normalization.ru.taggers.whitelist import WhiteListFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from fun_text_processing.text_normalization.ru.taggers.tokenize_and_classify import (
ClassifyFst as TNClassifyFst,
)
from pynini.lib import pynutil
import logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process
    an entire sentence that is lower cased.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State
    Archive (FAR) file.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        # The literal string "None" is treated the same as None (no caching).
        far_file = None
        if cache_dir not in (None, "None"):
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_ru_itn.far")

        # Fast path: reuse a previously exported grammar.
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
            return

        logging.info(f"Creating ClassifyFst grammars. This might take some time...")
        tn_classify = TNClassifyFst(
            input_case="cased", deterministic=False, cache_dir=cache_dir, overwrite_cache=True
        )

        # NOTE(review): construction order is kept exactly as before; several taggers call
        # ``invert()`` on graphs owned by ``tn_classify``.
        cardinal_graph = CardinalFst(tn_cardinal=tn_classify.cardinal).fst
        ordinal_graph = OrdinalFst(tn_ordinal=tn_classify.ordinal).fst
        decimal_graph = DecimalFst(tn_decimal=tn_classify.decimal).fst
        measure_graph = MeasureFst(tn_measure=tn_classify.measure).fst
        date_graph = DateFst(tn_date=tn_classify.date).fst
        word_graph = WordFst().fst
        time_graph = TimeFst(tn_time=tn_classify.time).fst
        money_graph = MoneyFst(tn_money=tn_classify.money).fst
        whitelist_graph = WhiteListFst().fst
        punct_graph = PunctuationFst().fst
        electronic_graph = ElectronicFst(tn_electronic=tn_classify.electronic).fst
        telephone_graph = TelephoneFst(tn_telephone=tn_classify.telephone).fst

        # Weighted union of all semiotic classes; plain words carry the largest weight.
        classify = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(time_graph, 1.1)
            | pynutil.add_weight(date_graph, 1.09)
            | pynutil.add_weight(decimal_graph, 1.1)
            | pynutil.add_weight(measure_graph, 1.1)
            | pynutil.add_weight(ordinal_graph, 1.1)
            | pynutil.add_weight(money_graph, 1.1)
            | pynutil.add_weight(telephone_graph, 1.1)
            | pynutil.add_weight(electronic_graph, 1.1)
            | pynutil.add_weight(cardinal_graph, 1.1)
            | pynutil.add_weight(word_graph, 100)
        )

        punct = (
            pynutil.insert("tokens { ")
            + pynutil.add_weight(punct_graph, weight=1.1)
            + pynutil.insert(" }")
        )
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")

        # A token may be surrounded by any number of punctuation tokens.
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        # A sentence is one or more such tokens separated by (collapsed) whitespace.
        more_tokens = pynini.closure(
            pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct
        )
        sentence = delete_space + (token_plus_punct + more_tokens) + delete_space
        self.fst = sentence.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
from fun_text_processing.text_normalization.ru.utils import get_abs_path
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
    """
    Inverse text normalization tagger for whitelisted phrases, e.g.
        "квартира" -> name: "кв."

    The mapping is read from ``data/whitelist.tsv`` and inverted. Unlike the other
    taggers in this package, the result is not passed through ``add_tokens`` here.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        whitelist_path = get_abs_path("data/whitelist.tsv")
        inverted_whitelist = pynini.string_file(whitelist_path).invert()

        name_field = (
            pynutil.insert('name: "') + convert_space(inverted_whitelist) + pynutil.insert('"')
        )
        self.fst = name_field.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal numbers, e.g.
        cardinal { integer: "1 001" } -> 1 001

    An optional ``negative: "<char>"`` field in front of the integer is unwrapped so that
    the single quoted character is emitted before the number.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)

        drop_quote = pynutil.delete('"')

        # negative: "<one non-quote character>" followed by optional whitespace
        sign = (
            pynutil.delete("negative:")
            + delete_space
            + drop_quote
            + DAMO_NOT_QUOTE
            + drop_quote
            + delete_space
        )
        optional_sign = pynini.closure(sign, 0, 1)

        # integer: "<one or more non-quote characters>"
        number = pynutil.delete('integer: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + drop_quote

        verbalized = self.delete_tokens(optional_sign + number)
        self.fst = verbalized.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "02.03.89" } -> "02.03.89"

    The quoted content of the ``day`` field (at least one character) is emitted unchanged.
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        day_value = pynini.closure(DAMO_NOT_QUOTE, 1)
        unwrapped = pynutil.delete('day: "') + day_value + pynutil.delete('"')

        verbalized = self.delete_tokens(unwrapped.optimize())
        self.fst = verbalized.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
DAMO_SPACE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal, e.g.
        decimal { negative: "true" integer_part: "3," fractional_part: "2" } -> -3,2

    An optional ``quantity`` field is appended after a single space.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)

        # negative: "true" becomes a leading minus sign
        optional_sign = pynini.closure(pynini.cross('negative: "true" ', "-"), 0, 1)

        # Shared pattern for ' "<value>"': drop the space and quotes, keep the value.
        quoted_value = (
            pynutil.delete(' "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
        )

        integer_part = pynutil.delete("integer_part:") + quoted_value
        fractional_part = pynutil.delete("fractional_part:") + quoted_value
        quantity = pynini.accep(DAMO_SPACE) + pynutil.delete("quantity:") + quoted_value
        optional_quantity = pynini.closure(quantity, 0, 1)

        number = optional_sign + integer_part + delete_space + fractional_part + optional_quantity
        self.fst = self.delete_tokens(number).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic, e.g.
        electronic { username: "ab@nd.ru" } -> "ab@nd.ru"

    The quoted content of the ``username`` field (at least one character) is emitted
    unchanged.
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")

        username_value = pynini.closure(DAMO_NOT_QUOTE, 1)
        unwrapped = pynutil.delete('username: "') + username_value + pynutil.delete('"')

        self.fst = self.delete_tokens(unwrapped).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
        measure { cardinal { integer: "2 кг" } } -> "2 кг"

    The nested ``cardinal { integer: "..." }`` wrapper is removed and the quoted content
    (at least one character) is emitted unchanged.
    """

    def __init__(self):
        super().__init__(name="measure", kind="verbalize")

        measure_value = pynini.closure(DAMO_NOT_QUOTE, 1)
        open_wrapper = pynutil.delete(' cardinal { integer: "')
        close_wrapper = pynutil.delete('"') + delete_space + pynutil.delete("}")

        unwrapped = open_wrapper + measure_value + close_wrapper
        self.fst = self.delete_tokens(unwrapped).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
        money { integer_part: "2 руб." } -> "2 руб."

    The quoted content of the ``integer_part`` field (at least one character) is emitted
    unchanged.
    """

    def __init__(self):
        super().__init__(name="money", kind="verbalize")

        amount_value = pynini.closure(DAMO_NOT_QUOTE, 1)
        unwrapped = pynutil.delete('integer_part: "') + amount_value + pynutil.delete('"')

        self.fst = self.delete_tokens(unwrapped).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal numbers, e.g.
        ordinal { integer: "2" } -> "2"

    The quoted content of the ``integer`` field is emitted unchanged; an empty value is
    accepted as well.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Zero or more non-quote characters (no minimum length is enforced here).
        unwrapped = (
            pynutil.delete('integer: "') + pynini.closure(DAMO_NOT_QUOTE) + pynutil.delete('"')
        )

        self.fst = self.delete_tokens(unwrapped).optimize()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment