Unverified commit 90c83387, authored by Sam Shleifer and committed by GitHub
Browse files

[MarianTokenizer] Switch to sacremoses for punc normalization (#5092)

parent 049e14f0
@@ -82,11 +82,11 @@ class MarianTokenizer(PreTrainedTokenizer):
     def _setup_normalizer(self):
         try:
-            from mosestokenizer import MosesPunctuationNormalizer
+            from sacremoses import MosesPunctNormalizer
-            self.punc_normalizer = MosesPunctuationNormalizer(self.source_lang)
+            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
-        except ImportError:
+        except (ImportError, FileNotFoundError):
-            warnings.warn("Recommended: pip install mosestokenizer")
+            warnings.warn("Recommended: pip install sacremoses.")
             self.punc_normalizer = lambda x: x
     def normalize(self, x: str) -> str:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment