import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
"""
Finite state transducer for verbalizing decimal, e.g.
decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" } -> -12.5006 billion
"""
def __init__(self):
super().__init__(name="decimal", kind="verbalize")
        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
integer = (
pynutil.delete("integer_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_integer = pynini.closure(integer + delete_space, 0, 1)
fractional = (
pynutil.insert(".")
+ pynutil.delete("fractional_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
quantity = (
pynutil.delete("quantity:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)
graph = optional_integer + optional_fractional + optional_quantity
self.numbers = graph
        graph = optional_sign + graph
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
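
# Illustrative smoke test (added sketch, not part of the original file). It
# assumes graph_utils mirrors the NeMo-style helpers, i.e. delete_tokens()
# strips the surrounding `decimal { ... }` wrapper before this graph applies.
if __name__ == "__main__":
    token = 'decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" }'
    lattice = pynini.escape(token) @ DecimalFst().fst
    print(pynini.shortestpath(lattice).string())  # expected: -12.5006 billion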
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
"""
Finite state transducer for verbalizing electronic
e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
"""
def __init__(self):
super().__init__(name="electronic", kind="verbalize")
user_name = (
pynutil.delete("username:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
domain = (
pynutil.delete("domain:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
protocol = (
pynutil.delete("protocol:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
graph = user_name + delete_space + pynutil.insert("@") + domain
graph |= protocol
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
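
# Illustrative smoke test (added sketch, not part of the original file). Note
# that this FST handles the inner `electronic { ... }` part only; the outer
# `tokens { ... }` wrapper from the docstring is removed by VerbalizeFinalFst.
if __name__ == "__main__":
    token = 'electronic { username: "cdf1" domain: "abc.edu" }'
    lattice = pynini.escape(token) @ ElectronicFst().fst
    print(pynini.shortestpath(lattice).string())  # expected: cdf1@abc.edu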
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
DAMO_SIGMA,
delete_extra_space,
)
from pynini.lib import pynutil
class FractionFst(GraphFst):
"""
Finite state transducer for verbalizing fraction,
    e.g. fraction { numerator: "2" denominator: "3" } -> 2/3
    e.g. fraction { negative: "true" numerator: "20" denominator: "3" } -> -20/3
"""
def __init__(self):
super().__init__(name="fraction", kind="verbalize")
optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
numerator = (
pynutil.delete('numerator: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
)
denominator = (
pynutil.insert("/")
+ pynutil.delete('denominator: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
graph = (numerator + delete_space + denominator).optimize()
# numerator = pynutil.delete('numerator: "') + DAMO_NOT_QUOTE + pynutil.delete('"')
#
# denominator = (
# pynutil.delete('denominator: "')
# + DAMO_NOT_QUOTE
# + pynutil.delete('"')
# )
#
# graph = (numerator + pynutil.insert("/") + denominator).optimize()
self.numbers = graph
delete_tokens = self.delete_tokens(optional_sign + graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_CHAR,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
"""
Finite state transducer for verbalizing measure, e.g.
measure { negative: "true" cardinal { integer: "12" } units: "kg" } -> -12 kg
Args:
decimal: DecimalFst
cardinal: CardinalFst
"""
def __init__(self, decimal: GraphFst, cardinal: GraphFst):
super().__init__(name="measure", kind="verbalize")
optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
unit = (
pynutil.delete("units:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
+ delete_space
)
graph_decimal = (
pynutil.delete("decimal {")
+ delete_space
+ optional_sign
+ delete_space
+ decimal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph_cardinal = (
pynutil.delete("cardinal {")
+ delete_space
+ optional_sign
+ delete_space
+ cardinal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph = (graph_cardinal | graph_decimal) + delete_space + unit
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_CHAR,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
"""
Finite state transducer for verbalizing money, e.g.
money { integer_part: "12" fractional_part: "05" currency: "$" } -> $12.05
Args:
decimal: DecimalFst
"""
def __init__(self, decimal: GraphFst):
super().__init__(name="money", kind="verbalize")
unit = (
pynutil.delete("currency:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = unit + delete_space + decimal.numbers
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
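
# Illustrative smoke test (added sketch, not part of the original file).
# MoneyFst re-uses DecimalFst().numbers, so the verbalizer is built from an
# already-constructed decimal verbalizer; the currency field comes first.
if __name__ == "__main__":
    from fun_text_processing.inverse_text_normalization.ja.verbalizers.decimal import DecimalFst

    money = MoneyFst(decimal=DecimalFst()).fst
    token = 'money { currency: "$" integer_part: "12" fractional_part: "05" }'
    print(pynini.shortestpath(pynini.escape(token) @ money).string())  # expected: $12.05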
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_NOT_QUOTE,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
"""
Finite state transducer for verbalizing ordinal, e.g.
        ordinal { integer: "13" } -> 第13
"""
def __init__(self):
super().__init__(name="ordinal", kind="verbalize")
# convert_rest = pynutil.insert("第", weight=0.01)
graph = (
pynutil.delete("integer:")
+ delete_space
+ pynutil.delete('"')
+ pynutil.insert("第", weight=0.01)
# + DAMO_NOT_QUOTE
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
# convert_hundred = pynini.cross("第百", "第100")
# convert_eleven = pynini.cross("11", "十一")
# convert_twelve = pynini.cross("12", "十二")
# convert_thirteen = pynini.cross("13", "第十三")
# convert_one = pynini.cross("1", "第一")
# convert_two = pynini.cross("2", "第二")
# convert_three = pynini.cross("3", "第三")
# suffix = pynini.cdrewrite(
# convert_hundred
# # convert_eleven
# # | convert_twelve
# # | convert_thirteen
# # | convert_one
# # | convert_two
# # | convert_three,
# "",
# "[EOS]",
# DAMO_SIGMA,
# )
# graph = graph @ suffix
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
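
# Illustrative smoke test (added sketch, not part of the original file). The
# verbalizer prefixes "第" to the digit string, e.g. integer "13" -> 第13.
if __name__ == "__main__":
    token = 'ordinal { integer: "13" }'
    lattice = pynini.escape(token) @ OrdinalFst().fst
    print(pynini.shortestpath(lattice).string())  # expected: 第13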
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
"""
Finite state transducer for verbalizing telephone, e.g.
telephone { number_part: "123-123-5678" }
-> 123-123-5678
"""
def __init__(self):
super().__init__(name="telephone", kind="verbalize")
number_part = (
pynutil.delete('number_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_country_code = pynini.closure(
pynutil.delete('country_code: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
+ pynini.accep(" "),
0,
1,
)
delete_tokens = self.delete_tokens(optional_country_code + number_part)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_CHAR,
DAMO_DIGIT,
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for verbalizing time, e.g.
time { hours: "12" minutes: "30" } -> 12:30
time { hours: "1" minutes: "12" } -> 01:12
time { hours: "2" suffix: "a.m." } -> 02:00 a.m.
"""
def __init__(self):
super().__init__(name="time", kind="verbalize")
add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
pynutil.insert("0") + DAMO_DIGIT
)
hour = (
pynutil.delete("hours:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_DIGIT, 1)
+ pynutil.delete('"')
)
minute = (
pynutil.delete("minutes:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_DIGIT, 1)
+ pynutil.delete('"')
)
suffix = (
delete_space
+ insert_space
+ pynutil.delete("suffix:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
optional_suffix = pynini.closure(suffix, 0, 1)
zone = (
delete_space
+ insert_space
+ pynutil.delete("zone:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
optional_zone = pynini.closure(zone, 0, 1)
graph = (
hour @ add_leading_zero_to_double_digit
+ delete_space
+ pynutil.insert(":")
+ (minute @ add_leading_zero_to_double_digit)
+ optional_suffix
+ optional_zone
)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
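
# Illustrative smoke test (added sketch, not part of the original file).
# Single-digit hours/minutes are zero-padded by composing each field with
# add_leading_zero_to_double_digit before the ":" separator is inserted.
if __name__ == "__main__":
    token = 'time { hours: "1" minutes: "12" }'
    lattice = pynini.escape(token) @ TimeFst().fst
    print(pynini.shortestpath(lattice).string())  # expected: 01:12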
from fun_text_processing.inverse_text_normalization.ja.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.time import TimeFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.fraction import FractionFst
from fun_text_processing.inverse_text_normalization.ja.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
"""
Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst finite state archive (FAR) file.
    More details on deployment can be found in NeMo/tools/text_processing_deployment.
"""
def __init__(self):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal_graph = cardinal.fst
ordinal_graph = OrdinalFst().fst
decimal = DecimalFst()
decimal_graph = decimal.fst
fraction = FractionFst()
fraction_graph = fraction.fst
measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst
money_graph = MoneyFst(decimal=decimal).fst
time_graph = TimeFst().fst
date_graph = DateFst().fst
whitelist_graph = WhiteListFst().fst
telephone_graph = TelephoneFst().fst
electronic_graph = ElectronicFst().fst
graph = (
time_graph
| date_graph
| money_graph
| fraction_graph
| measure_graph
| ordinal_graph
| decimal_graph
| cardinal_graph
| whitelist_graph
| telephone_graph
| electronic_graph
)
self.fst = graph
import pynini
from fun_text_processing.inverse_text_normalization.ja.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.ja.verbalizers.word import WordFst
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
"""
def __init__(self):
super().__init__(name="verbalize_final", kind="verbalize")
verbalize = VerbalizeFst().fst
word = WordFst().fst
types = verbalize | word
graph = (
pynutil.delete("tokens")
+ delete_space
+ pynutil.delete("{")
+ delete_space
+ types
+ delete_space
+ pynutil.delete("}")
)
graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space
self.fst = graph
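
# Illustrative end-to-end sketch (added, not part of the original file).
# VerbalizeFinalFst unwraps each `tokens { ... }` group of a tagged sentence
# and delegates the body to VerbalizeFst (or WordFst for plain words).
if __name__ == "__main__":
    sentence = 'tokens { time { hours: "12" minutes: "30" } }'
    lattice = pynini.escape(sentence) @ VerbalizeFinalFst().fst
    print(pynini.shortestpath(lattice).string())  # expected: 12:30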
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
"""
Finite state transducer for verbalizing whitelist
e.g. tokens { name: "mrs." } -> mrs.
"""
def __init__(self):
super().__init__(name="whitelist", kind="verbalize")
graph = (
pynutil.delete("name:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = graph @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
self.fst = graph.optimize()
import pynini
from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WordFst(GraphFst):
"""
Finite state transducer for verbalizing plain tokens
e.g. tokens { name: "sleep" } -> sleep
"""
def __init__(self):
super().__init__(name="word", kind="verbalize")
chars = pynini.closure(DAMO_CHAR - " ", 1)
char = (
pynutil.delete("name:")
+ delete_space
+ pynutil.delete('"')
+ chars
+ pynutil.delete('"')
)
graph = char @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
self.fst = graph.optimize()
from fun_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import (
ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)
from argparse import ArgumentParser
from typing import List
import regex as re
from fun_text_processing.text_normalization.data_loader_utils import (
EOS_TYPE,
Instance,
load_files,
training_data_to_sentences,
)
"""
This file is for evaluation purposes.
filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually.
For example, normalized text should only contain letters and whitespace, no punctuation.
Unnormalized cardinal instances must contain at least one digit; all other characters are removed.
"""
class Filter:
"""
Filter class
Args:
class_type: semiotic class used in dataset
process_func: function to transform text
filter_func: function to filter text
"""
def __init__(self, class_type: str, process_func: object, filter_func: object):
self.class_type = class_type
self.process_func = process_func
self.filter_func = filter_func
def filter(self, instance: Instance) -> bool:
"""
        Filters the given instance with this Filter's filter function.
        Args:
            instance: instance to filter
        Returns: True if the instance fulfills the filter criteria or does not belong to this Filter's class type
"""
if instance.token_type != self.class_type:
return True
return self.filter_func(instance)
def process(self, instance: Instance) -> Instance:
"""
        Processes the given instance with this Filter's process function.
        Args:
            instance: instance to process
        Returns: the processed instance if it belongs to the expected class type, otherwise the original instance
"""
if instance.token_type != self.class_type:
return instance
return self.process_func(instance)
def filter_cardinal_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_cardinal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r"[^0-9]", "", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_ordinal_1(instance: Instance) -> bool:
ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized)
    return ok is not None
def process_ordinal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r"[,\s]", "", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_decimal_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_decimal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r",", "", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_measure_1(instance: Instance) -> bool:
ok = True
return ok
def process_measure_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r",", "", un_normalized)
un_normalized = re.sub(r"m2", "m²", un_normalized)
un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized)
normalized = re.sub(r"[^a-z\s]", "", normalized)
normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_money_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_money_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r",", "", un_normalized)
un_normalized = re.sub(r"a\$", r"$", un_normalized)
un_normalized = re.sub(r"us\$", r"$", un_normalized)
un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized)
un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_time_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_time_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r": ", ":", un_normalized)
un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized)
un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_plain_1(instance: Instance) -> bool:
ok = True
return ok
def process_plain_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_punct_1(instance: Instance) -> bool:
ok = True
return ok
def process_punct_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_date_1(instance: Instance) -> bool:
ok = True
return ok
def process_date_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r",", "", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_letters_1(instance: Instance) -> bool:
ok = True
return ok
def process_letters_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_verbatim_1(instance: Instance) -> bool:
ok = True
return ok
def process_verbatim_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_digit_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_digit_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_telephone_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_telephone_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_electronic_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_electronic_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_fraction_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_fraction_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_address_1(instance: Instance) -> bool:
ok = True
return ok
def process_address_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
filters = []
filters.append(
Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)
)
filters.append(
Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)
)
filters.append(
Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)
)
filters.append(
Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)
)
filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1))
filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1))
filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1))
filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1))
filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1))
filters.append(
Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)
)
filters.append(
Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)
)
filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1))
filters.append(
Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)
)
filters.append(
Filter(
class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1
)
)
filters.append(
Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)
)
filters.append(
Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)
)
filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True))
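def _demo_single_filter() -> None:
    # Added sketch (not part of the original file): how one registered Filter
    # is meant to behave on a hand-written Instance.
    inst = Instance(
        token_type="CARDINAL",
        un_normalized="12,345",
        normalized="twelve thousand three hundred forty five",
    )
    cardinal_filter = filters[0]  # the CARDINAL filter registered first above
    if cardinal_filter.filter(inst):  # True: un_normalized contains a digit
        inst = cardinal_filter.process(inst)  # un_normalized becomes "12345"
    print(inst)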
def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
"""
Filters list of instances
Args:
        data: list of instances
        verbose: if True, print each processed instance
    Returns: filtered and transformed list of instances
    """
    updated_instances = []
for instance in data:
updated_instance = False
for fil in filters:
if fil.class_type == instance.token_type and fil.filter(instance):
instance = fil.process(instance)
updated_instance = True
if updated_instance:
if verbose:
print(instance)
            updated_instances.append(instance)
    return updated_instances
def parse_args():
parser = ArgumentParser()
parser.add_argument(
"--input", help="input file path", type=str, default="./en_with_types/output-00001-of-00100"
)
parser.add_argument("--verbose", help="print filtered instances", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
file_path = args.input
print("Loading training data: " + file_path)
instance_list = load_files([file_path]) # List of instances
filtered_instance_list = filter_loaded_data(instance_list, args.verbose)
training_data_to_sentences(filtered_instance_list)
% 퍼센트
# 파운드
= 등호
@ 골뱅이
≥ 보다 크거나 같음
≤ 보다 작거나 같음
≠ 부등호
≈ 근삿값
± 플러스마이너스
× 곱셈기호
Α 알파
Β 베타
Γ 감마
Δ 델타
Ε 엡실론
Ζ 제타
Θ 세타
Ι 이오타
Κ 카파
Λ 람다
Μ 뮤
Ν 뉴
Ξ 크시
Ο 오미크론
Π 파이
Ρ 로
Σ 시그마
Τ 타우
Υ 입실론
Φ 피
Χ 키
Ψ 프시
Ω 오메가
α 알파
β 베타
γ 감마
δ 델타
ε 엡실론
ζ 제타
η 에타
θ 세타
ι 이오타
κ 카파
λ 람다
μ 뮤
ν 뉴
ξ 크시
ο 오미크론
π 파이
ρ 로
σ 시그마
τ 타우
υ 입실론
φ 피
χ 키
ψ 프시
ω 오메가
$ 달러
$ 미국 달러
£ 영국 파운드
€ 유로
₩ 원
nzd 뉴질랜드 달러
rs 루피
chf 스위스 프랑
dkk 덴마크 크로네
fim 핀란드 마르카
aed 아랍 에미리트 디르함
¥ 엔
czk 체코 코루나
mro 모리타니 우기야
pkr 파키스탄 루피
crc 코스타리카 콜론
hk$ 홍콩 달러
npr 네팔 루피
awg 아루반 플로린
nok 노르웨이 크로네
tzs 탄자니아 실링
sek 스웨덴 크로나
cyp 키프로스 파운드
sar 사우디 리얄
cve 케이프 베르데 에스쿠도
rsd 세르비아 디나르
dm 독일 마크
shp 세인트 헬레나 파운드
php 필리핀 페소
cad 캐나다 달러
ssp 남수단 파운드
scr 세이셸 루피
mvr 몰디브 루피야
일일 01
이일 02
삼일 03
사일 04
오일 05
육일 06
칠일 07
팔일 08
구일 09
십일 10
십일일 11
십이일 12
십삼일 13
십사일 14
십오일 15
십육일 16
십칠일 17
십팔일 18
십구일 19
이십일 20
이십일일 21
이십이일 22
이십삼일 23
이십사일 24
이십오일 25
이십육일 26
이십칠일 27
이십팔일 28
이십구일 29
삼십일 30
삼십일일 31
하루 01
이틀 02
사흘 03
나흘 04
닷새 05
엿새 06
이레 07
여드레 08
아흐레 09
열흘 10
열하루 11
열이틀 12
열사흘 13
스무날 20
스무하루 21
스무아흐레 29
그믐 30