Commit 70a8a9e0 authored by wangwei990215

initial commit
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
"""
Finite state transducer for verbalizing telephone
e.g. telephone { number_part: "8-913-983-56-01" } -> "8-913-983-56-01"
"""
def __init__(self):
super().__init__(name="telephone", kind="verbalize")
graph = (
pynutil.delete('number_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
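

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; assumes pynini.lib.rewrite is available and that the
    # input uses the serialized token format shown in the docstring above).
    from pynini.lib import rewrite

    fst = TelephoneFst().fst
    print(rewrite.top_rewrite('telephone { number_part: "8-913-983-56-01" }', fst))
    # expected output (per the docstring): 8-913-983-56-01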
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for verbalizing time
e.g. time { hours: "02:15" } -> "02:15"
"""
def __init__(self):
super().__init__(name="time", kind="verbalize")
hour = (
pynutil.delete("hours: ")
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
minutes = (
pynutil.delete("minutes: ")
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
graph_preserve_order = (
pynutil.delete('hours: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
)
# for cases that require permutations for the correct verbalization
graph_reverse_order = hour + delete_space + pynutil.insert(":") + minutes + delete_space
graph = graph_preserve_order | graph_reverse_order
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
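

if __name__ == "__main__":
    # Rough usage sketch (illustrative; assumes pynini.lib.rewrite is available). It exercises
    # both branches of the grammar: a single pre-formatted hours field, and separate
    # hours/minutes fields joined with ":".
    from pynini.lib import rewrite

    fst = TimeFst().fst
    print(rewrite.top_rewrite('time { hours: "02:15" }', fst))  # expected: 02:15
    print(rewrite.top_rewrite('time { hours: "12" minutes: "30" }', fst))  # expected: 12:30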
from fun_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.time import TimeFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
"""
Composes other verbalizer grammars.
For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
More details on deployment can be found at NeMo/tools/text_processing_deployment.
"""
def __init__(self):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal_graph = cardinal.fst
ordinal = OrdinalFst()
ordinal_graph = ordinal.fst
decimal = DecimalFst()
decimal_graph = decimal.fst
whitelist_graph = WhiteListFst().fst
electronic_graph = ElectronicFst().fst
money_graph = MoneyFst().fst
date_graph = DateFst().fst
measure_graph = MeasureFst().fst
telephone_graph = TelephoneFst().fst
time_graph = TimeFst().fst
graph = (
whitelist_graph
| cardinal_graph
| ordinal_graph
| decimal_graph
| electronic_graph
| date_graph
| money_graph
| measure_graph
| telephone_graph
| time_graph
)
self.fst = graph
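

if __name__ == "__main__":
    # Rough export sketch (illustrative): compile the composed grammar and write it to a FAR
    # file, roughly as described in the class docstring. The output path and key name are
    # made-up examples; this assumes pynini.export.export.Exporter is available.
    from pynini.export import export

    exporter = export.Exporter("ru_verbalize.far")
    exporter["verbalize"] = VerbalizeFst().fst.optimize()
    exporter.close()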
import pynini
from fun_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
"""
def __init__(self):
super().__init__(name="verbalize_final", kind="verbalize")
verbalize = VerbalizeFst().fst
word = WordFst().fst
types = verbalize | word
graph = (
pynutil.delete("tokens")
+ delete_space
+ pynutil.delete("{")
+ delete_space
+ types
+ delete_space
+ pynutil.delete("}")
)
graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
self.fst = graph
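

if __name__ == "__main__":
    # Minimal end-to-end sketch (illustrative; the input follows the docstring example above).
    from pynini.lib import rewrite

    fst = VerbalizeFinalFst().fst
    tagged = 'tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" }'
    print(rewrite.top_rewrite(tagged, fst))  # expected: its 12:30 now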
from argparse import ArgumentParser
from fun_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from fun_text_processing.text_normalization.data_loader_utils import (
evaluate,
known_types,
load_files,
training_data_to_sentences,
training_data_to_tokens,
)
"""
Runs evaluation on data in the format: <semiotic class>\t<unnormalized text>\t<`self` if trivial class, otherwise normalized text>,
e.g. the Google text normalization data https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish
"""
def parse_args():
parser = ArgumentParser()
parser.add_argument("--input", help="input file path", type=str)
parser.add_argument(
"--lang",
help="language",
choices=["en", "id", "ja", "de", "es", "pt", "ru", "fr", "vi", "ko", "zh", "fil"],
default="en",
type=str,
)
parser.add_argument(
"--cat",
dest="category",
help="focus on class only (" + ", ".join(known_types) + ")",
type=str,
default=None,
choices=known_types,
)
parser.add_argument(
"--filter", action="store_true", help="clean data for inverse normalization purposes"
)
return parser.parse_args()
if __name__ == "__main__":
# Example usage:
# python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter
args = parse_args()
if args.lang == "en":
from fun_text_processing.inverse_text_normalization.en.clean_eval_data import (
filter_loaded_data,
)
file_path = args.input
inverse_normalizer = InverseNormalizer()
print("Loading training data: " + file_path)
training_data = load_files([file_path])
if args.filter:
training_data = filter_loaded_data(training_data)
if args.category is None:
print("Sentence level evaluation...")
sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data)
print("- Data: " + str(len(sentences_normalized)) + " sentences")
sentences_prediction = inverse_normalizer.inverse_normalize_list(sentences_normalized)
print("- Denormalized. Evaluating...")
sentences_accuracy = evaluate(
preds=sentences_prediction, labels=sentences_un_normalized, input=sentences_normalized
)
print("- Accuracy: " + str(sentences_accuracy))
print("Token level evaluation...")
tokens_per_type = training_data_to_tokens(training_data, category=args.category)
token_accuracy = {}
for token_type in tokens_per_type:
print("- Token type: " + token_type)
tokens_un_normalized, tokens_normalized = tokens_per_type[token_type]
print(" - Data: " + str(len(tokens_normalized)) + " tokens")
tokens_prediction = inverse_normalizer.inverse_normalize_list(tokens_normalized)
print(" - Denormalized. Evaluating...")
token_accuracy[token_type] = evaluate(
tokens_prediction, tokens_un_normalized, input=tokens_normalized
)
print(" - Accuracy: " + str(token_accuracy[token_type]))
token_count_per_type = {
token_type: len(tokens_per_type[token_type][0]) for token_type in tokens_per_type
}
token_weighted_accuracy = [
token_count_per_type[token_type] * accuracy
for token_type, accuracy in token_accuracy.items()
]
print("- Accuracy: " + str(sum(token_weighted_accuracy) / sum(token_count_per_type.values())))
print(" - Total: " + str(sum(token_count_per_type.values())), "\n")
for token_type in token_accuracy:
if token_type not in known_types:
raise ValueError("Unexpected token type: " + token_type)
if args.category is None:
c1 = ["Class", "sent level"] + known_types
c2 = ["Num Tokens", len(sentences_normalized)] + [
token_count_per_type[known_type] if known_type in tokens_per_type else "0"
for known_type in known_types
]
c3 = ["Denormalization", sentences_accuracy] + [
token_accuracy[known_type] if known_type in token_accuracy else "0"
for known_type in known_types
]
for i in range(len(c1)):
print(f"{str(c1[i]):10s} | {str(c2[i]):10s} | {str(c3[i]):5s}")
else:
print(f"numbers\t{token_count_per_type[args.category]}")
print(f"Denormalization\t{token_accuracy[args.category]}")
from fun_text_processing.inverse_text_normalization.tl.taggers.tokenize_and_classify import (
ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.tl.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.tl.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)
from argparse import ArgumentParser
from typing import List
import regex as re
from fun_text_processing.text_normalization.data_loader_utils import (
EOS_TYPE,
Instance,
load_files,
training_data_to_sentences,
)
"""
This file is for evaluation purposes.
filter_loaded_data() cleans data (a list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually.
For example, normalized text should contain only letters and whitespace, with no punctuation,
and unnormalized CARDINAL instances should contain at least one digit; all other characters are removed.
"""
class Filter:
"""
Filter class
Args:
class_type: semiotic class used in dataset
process_func: function to transform text
filter_func: function to filter text
"""
def __init__(self, class_type: str, process_func: object, filter_func: object):
self.class_type = class_type
self.process_func = process_func
self.filter_func = filter_func
def filter(self, instance: Instance) -> bool:
"""
Filters the given instance with the filter function.
Args:
instance: instance to filter
Returns: True if the given instance fulfills the criteria or does not belong to the class type
"""
if instance.token_type != self.class_type:
return True
return self.filter_func(instance)
def process(self, instance: Instance) -> Instance:
"""
Processes the given instance with the process function.
Args:
instance: instance to process
Returns: the processed instance if it belongs to the expected class type, otherwise the original instance
"""
if instance.token_type != self.class_type:
return instance
return self.process_func(instance)
def filter_cardinal_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_cardinal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r"[^0-9]", "", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_ordinal_1(instance: Instance) -> bool:
ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized)
return ok
def process_ordinal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r"[,\s]", "", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_decimal_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_decimal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r",", "", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_measure_1(instance: Instance) -> bool:
ok = True
return ok
def process_measure_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r",", "", un_normalized)
un_normalized = re.sub(r"m2", "m²", un_normalized)
un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized)
normalized = re.sub(r"[^a-z\s]", "", normalized)
normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_money_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_money_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r",", "", un_normalized)
un_normalized = re.sub(r"a\$", r"$", un_normalized)
un_normalized = re.sub(r"us\$", r"$", un_normalized)
un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized)
un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_time_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_time_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r": ", ":", un_normalized)
un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized)
un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_plain_1(instance: Instance) -> bool:
ok = True
return ok
def process_plain_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_punct_1(instance: Instance) -> bool:
ok = True
return ok
def process_punct_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_date_1(instance: Instance) -> bool:
ok = True
return ok
def process_date_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r",", "", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_letters_1(instance: Instance) -> bool:
ok = True
return ok
def process_letters_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_verbatim_1(instance: Instance) -> bool:
ok = True
return ok
def process_verbatim_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_digit_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_digit_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_telephone_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_telephone_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_electronic_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_electronic_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_fraction_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
return ok
def process_fraction_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_address_1(instance: Instance) -> bool:
ok = True
return ok
def process_address_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
filters = []
filters.append(
Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)
)
filters.append(
Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)
)
filters.append(
Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)
)
filters.append(
Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)
)
filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1))
filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1))
filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1))
filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1))
filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1))
filters.append(
Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)
)
filters.append(
Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)
)
filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1))
filters.append(
Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)
)
filters.append(
Filter(
class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1
)
)
filters.append(
Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)
)
filters.append(
Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)
)
filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True))
def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
"""
Filters and transforms a list of instances.
Args:
data: list of instances
verbose: if True, print each transformed instance
Returns: filtered and transformed list of instances
"""
updates_instances = []
for instance in data:
updated_instance = False
for fil in filters:
if fil.class_type == instance.token_type and fil.filter(instance):
instance = fil.process(instance)
updated_instance = True
if updated_instance:
if verbose:
print(instance)
updates_instances.append(instance)
return updates_instances
def parse_args():
parser = ArgumentParser()
parser.add_argument(
"--input", help="input file path", type=str, default="./en_with_types/output-00001-of-00100"
)
parser.add_argument("--verbose", help="print filtered instances", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
file_path = args.input
print("Loading training data: " + file_path)
instance_list = load_files([file_path]) # List of instances
filtered_instance_list = filter_loaded_data(instance_list, args.verbose)
training_data_to_sentences(filtered_instance_list)
$ dolyar
$ us dollar
$ united states dollar
£ british pound
€ euro
₩ nanalo
nzd new zealand dolyar
rs rupee
chf swiss franc
dkk danish kroner
fim finnish markka
aed dirham ng emirates ng arab
¥ yen
czk czech koruna
mro mauritanian ouguiya
pkr pakistani rupee
crc costa rican colon
hk$ hong kong dolyar
npr nepalese rupee
awg aruban florin
nok norwegian kroner
tzs tanzanian shilling
sek swedish kronor
cyp cypriot pound
r real
sar saudi riyal
cve cape verde escudo
rsd serbian dinar
dm markang aleman
shp saint helena pounds
php philippine peso
cad canadian dollar
ssp timog sudanese pound
scr seychelles rupee
mvr maldivian rufiyaa
g mail gmail
gmail
n vidia nvidia
nvidia
outlook
hotmail
yahoo
aol
gmx
msn
live
yandex
orange
wanadoo
web
comcast
google
. dot
- dash
- hyphen
_ underscore
! exclamation mark
# number sign
$ dollar sign
% percent sign
& ampersand
' quote
* asterisk
+ plus
/ slash
= equal sign
? question mark
^ circumflex
` right single quote
{ left brace
| vertical bar
} right brace
~ tilde
, comma
f fahrenheit
c celsius
km kilometer
m meter
cm centimeter
mm millimeter
ha hectare
mi mile
m² square meter
km² square kilometer
ft foot
% percent
hz hertz
kw kilowatt
hp horsepower
mg milligram
kg kilogram
ghz gigahertz
khz kilohertz
mhz megahertz
v volt
h hour
mc mega coulomb
s second
nm nanometer
rpm revolution per minute
min minute
mA milli ampere
% per cent
kwh kilo watt hour
m³ cubic meter
mph mile per hour
tw tera watt
mv milli volt
mw megawatt
μm micrometer
" inch
tb terabyte
cc c c
g gram
da dalton
atm atmosphere
ω ohm
db decibel
ps peta second
oz ounce
hl hecto liter
μg microgram
pg petagram
gb gigabyte
kb kilobit
ev electron volt
mb megabyte
kb kilobyte
kbps kilobit per second
mbps megabit per second
st stone
kl kilo liter
tj tera joule
kv kilo volt
mv mega volt
kn kilonewton
mm megameter
au astronomical unit
yd yard
rad radian
lm lumen
hs hecto second
mol mole
gpa giga pascal
ml milliliter
gw gigawatt
ma mega ampere
kt knot
kgf kilogram force
ng nano gram
ns nanosecond
ms mega siemens
bar bar
gl giga liter
μs microsecond
da deci ampere
pa pascal
ds deci second
ms milli second
dm deci meter
dm³ cubic deci meter
amu atomic mass unit
mb megabit
mf mega farad
bq becquerel
pb petabit
mm² square millimeter
cm² square centimeter
sq mi square mile
sq ft square foot
kpa kilopascal
cd candela
tl tera liter
ms mega second
mpa megapascal
pm peta meter
pb peta byte
gwh giga watt hour
kcal kilo calory
gy gray
sv sievert
cwt hundredweight
cc c c
enero
pebrero
martsa
abril
maaaring
hunyo
hulyo
agosto
september
oktubre
nobyembre
disyembre
isa 1
dalawa 2
tatlo 3
apat 4
lima 5
anim 6
pito 7
walo 8
siyam 9
isang 1
dalawang 2
tatlong 3
apat na 4
limang 5
anim na 6
pitong 7
walong 8
siyam na 9
isang 1
dalawang 2
tatlong 3
limang 5
anim na 6
pitong 7
walong 8