"...ldm/git@developer.sourcefind.cn:chenpangpang/ComfyUI.git" did not exist on "f83109f09bec04f39f028c275b4eb1231adba00a"
Commit ca4baf8c authored by Shijie Wu

Match order of casing in OSS XLM; improve documentation; clean up dependencies

parent a175a9dc
@@ -20,14 +20,11 @@ import json
 import logging
 import os
 import re
-import sys
 import unicodedata
 from io import open
-import jieba
-import Mykytea
 import sacremoses as sm
-from nltk.tokenize.stanford_segmenter import StanfordSegmenter
-from pythainlp.tokenize import word_tokenize as th_word_tokenize
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
@@ -93,6 +90,7 @@ def lowercase_and_remove_accent(text):
     Lowercase and strips accents from a piece of text based on
     https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
     """
+    text = ' '.join(text)
     text = text.lower()
     text = unicodedata.normalize("NFD", text)
     output = []
@@ -101,7 +99,7 @@ def lowercase_and_remove_accent(text):
         if cat == "Mn":
             continue
         output.append(char)
-    return "".join(output).lower()
+    return "".join(output).lower().split(' ')


 def replace_unicode_punct(text):
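Taken together, the two hunks above change `lowercase_and_remove_accent` so it now takes and returns a list of tokens: the tokens are joined with spaces, lowercased, NFD-normalized with combining marks stripped, then split back on spaces. A minimal sketch of the resulting behaviour (the loop header not visible in the hunks is reconstructed from context; the sample call is purely illustrative):

```python
import unicodedata

def lowercase_and_remove_accent(text):
    # After this commit the helper operates on a list of tokens, not a raw string.
    text = ' '.join(text)
    text = text.lower()
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat == "Mn":  # drop combining marks, i.e. the accents
            continue
        output.append(char)
    return "".join(output).lower().split(' ')

# Illustrative: a pre-tokenized sentence in, normalized tokens out.
print(lowercase_and_remove_accent(["Père", "Noël"]))  # ['pere', 'noel']
```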
@@ -176,13 +174,13 @@ def romanian_preprocessing(text):
 class XLMTokenizer(PreTrainedTokenizer):
     """
-    BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
-        - lower case all inputs
-        - uses `SpaCy tokenizer <https://spacy.io/api/tokenizer/>`_ and \
-          `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ for pre-BPE tokenization if they are installed, \
-          fallback to BERT's BasicTokenizer if not.
+    BPE tokenizer for XLM
+
+        - Moses preprocessing & tokenization for most supported languages
+        - Language-specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
+        - (optionally) lower case & normalize all input text
         - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
           (ex: "__classify__") to a vocabulary.
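The rewritten class docstring above summarizes the new pipeline: Moses preprocessing and tokenization for most languages, dedicated tokenizers for Chinese, Japanese and Thai, and optional lowercasing/normalization. As a hedged usage sketch (the checkpoint name is only an example, and passing `lang` through `tokenize()` assumes the base class forwards keyword arguments to `_tokenize()`):

```python
from pytorch_transformers import XLMTokenizer

# Example checkpoint name; any XLM checkpoint with vocab/merges files would do.
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

# English goes through the Moses pipeline, then BPE.
tokens = tokenizer.tokenize("Hello, how are you?", lang='en')
ids = tokenizer.convert_tokens_to_ids(tokens)
```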
@@ -244,30 +242,18 @@ class XLMTokenizer(PreTrainedTokenizer):
     def ja_tokenize(self, text):
         if self.ja_word_tokenizer is None:
             try:
+                import Mykytea
                 self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
-            except RuntimeError:
-                logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) with the following steps")
+            except:
+                logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and its Python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
                 logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
                 logger.error("2. autoreconf -i")
                 logger.error("3. ./configure --prefix=$HOME/local")
                 logger.error("4. make && make install")
+                logger.error("5. pip install kytea")
                 import sys; sys.exit()
         return list(self.ja_word_tokenizer.getWS(text))

-    def zh_tokenize(self, text):
-        if self.zh_word_tokenizer is None:
-            try:
-                self.zh_word_tokenizer = StanfordSegmenter()
-                self.zh_word_tokenizer.default_config('zh')
-            except LookupError:
-                logger.error("Make sure you download stanford-segmenter (https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip) with the following steps")
-                logger.error("1. wget https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip -O /path/to/stanford-segmenter-2018-10-16.zip")
-                logger.error("2. cd /path/to && unzip stanford-segmenter-2018-10-16.zip")
-                logger.error("3. cd stanford-segmenter-2018-10-16 && cp stanford-segmenter-3.9.2.jar stanford-segmenter.jar")
-                logger.error("4. set env variable STANFORD_SEGMENTER=/path/to/stanford-segmenter-2018-10-16")
-                import sys; sys.exit()
-        return self.zh_word_tokenizer.segment(text)
-
     @property
     def vocab_size(self):
         return len(self.encoder)
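With the module-level imports removed, `ja_tokenize` now imports Mykytea lazily and, on failure, logs install instructions and exits; the Thai and Chinese branches further down follow the same pattern. A generic sketch of that pattern, using a hypothetical `lazy_import` helper (the actual diff inlines this logic rather than factoring it out):

```python
import logging
import sys

logger = logging.getLogger(__name__)

def lazy_import(module_name, *install_steps):
    """Hypothetical helper: import an optional dependency on first use.

    Logs install instructions and exits if the module is unavailable,
    mirroring the inlined pattern in ja_tokenize and _tokenize.
    """
    try:
        return __import__(module_name)
    except ImportError:
        logger.error("Make sure you install %s with the following steps", module_name)
        for i, step in enumerate(install_steps, 1):
            logger.error("%d. %s", i, step)
        sys.exit()

# Illustrative use, mirroring the Chinese branch of _tokenize:
# jieba = lazy_import('jieba', 'pip install jieba')
```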
@@ -315,11 +301,44 @@ class XLMTokenizer(PreTrainedTokenizer):
             self.cache[token] = word
         return word

-    def _tokenize(self, text, lang='en'):
-        """ Tokenize a string. """
-        if self.do_lowercase_and_remove_accent:
-            text = lowercase_and_remove_accent(text)
-        if lang not in self.lang_with_custom_tokenizer:
+    def _tokenize(self, text, lang='en', bypass_tokenizer=False):
+        """
+        Tokenize a string given a language code. For Chinese, Japanese and Thai, we use a language-specific tokenizer. Otherwise, we use Moses.
+
+        Details of tokenization:
+            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
+                - Install with `pip install sacremoses`
+            - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
+                - Install with `pip install pythainlp`
+            - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea)
+                - Install with the following steps:
+                ```
+                git clone git@github.com:neubig/kytea.git && cd kytea
+                autoreconf -i
+                ./configure --prefix=$HOME/local
+                make && make install
+                pip install kytea
+                ```
+            - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer *
+                - Install with `pip install jieba`
+
+        \* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
+        However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
+        Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
+        if you fine-tune the model with Chinese supervision. If you want the same exact behaviour, use the original XLM
+        [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally,
+        and set `bypass_tokenizer=True` to bypass the tokenizer.
+
+        Args:
+            - lang: ISO language code (default = 'en') (string). Languages should be among the languages supported by the model. However, we don't enforce it.
+            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.
+
+        Returns:
+            List of tokens.
+        """
+        if bypass_tokenizer:
+            text = text.split()
+        elif lang not in self.lang_with_custom_tokenizer:
             text = self.moses_pipeline(text, lang=lang)
             # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
             if lang == 'ro':
@@ -327,9 +346,22 @@ class XLMTokenizer(PreTrainedTokenizer):
             text = self.moses_tokenize(text, lang=lang)
         elif lang == 'th':
             text = self.moses_pipeline(text, lang=lang)
+            try:
+                if 'pythainlp' not in sys.modules:
+                    from pythainlp.tokenize import word_tokenize as th_word_tokenize
+            except:
+                logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
+                logger.error("1. pip install pythainlp")
+                import sys; sys.exit()
             text = th_word_tokenize(text)
         elif lang == 'zh':
-            # text = self.zh_tokenize(text)
+            try:
+                if 'jieba' not in sys.modules:
+                    import jieba
+            except:
+                logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
+                logger.error("1. pip install jieba")
+                import sys; sys.exit()
             text = ' '.join(jieba.cut(text))
             text = self.moses_pipeline(text, lang=lang)
             text = text.split()
@@ -339,9 +371,13 @@ class XLMTokenizer(PreTrainedTokenizer):
         else:
             raise ValueError('It should not reach here')

+        if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
+            text = lowercase_and_remove_accent(text)
+
         split_tokens = []
         for token in text:
-            split_tokens.extend([t for t in self.bpe(token).split(' ')])
+            if token:
+                split_tokens.extend([t for t in self.bpe(token).split(' ')])

         return split_tokens
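The new `_tokenize` signature makes the per-language routing and the `bypass_tokenizer` escape hatch explicit. A hedged usage sketch (the checkpoint name is illustrative, `jieba` must be installed for the Chinese call, and forwarding of these keyword arguments through `tokenize()` is assumed):

```python
from pytorch_transformers import XLMTokenizer

# Illustrative multilingual checkpoint name.
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-17-1280')

# Chinese: jieba segmentation, then the Moses pipeline, then BPE (per the diff above).
zh_tokens = tokenizer.tokenize("我爱北京天安门", lang='zh')

# Pre-tokenized input: with bypass_tokenizer=True only whitespace splitting and BPE
# are applied, matching sentences preprocessed with the original XLM tools.
pre_tokenized = "this is an already tokenized sentence"
bpe_tokens = tokenizer.tokenize(pre_tokenized, bypass_tokenizer=True)
```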
...
@@ -11,8 +11,4 @@ regex
 # For XLNet
 sentencepiece
 # For XLM
 sacremoses
-pythainlp
-kytea
-nltk
-jieba
\ No newline at end of file
@@ -56,11 +56,7 @@ setup(
                       'tqdm',
                       'regex',
                       'sentencepiece',
-                      'sacremoses',
-                      'pythainlp',
-                      'kytea',
-                      'nltk',
-                      'jieba'],
+                      'sacremoses'],
     entry_points={
         'console_scripts': [
             "pytorch_transformers=pytorch_transformers.__main__:main",
...
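With the hard requirements on pythainlp, kytea, nltk and jieba dropped from requirements.txt and setup.py, only sacremoses remains mandatory; the language-specific tokenizers are imported on demand. A small sketch (not part of the diff) for checking which of the now-optional packages are present in an environment:

```python
import importlib.util

# Packages removed from the install requirements; each is only needed for its
# own language branch in _tokenize (values are the import names).
optional = {'Chinese': 'jieba', 'Thai': 'pythainlp', 'Japanese': 'Mykytea'}

for language, module in optional.items():
    available = importlib.util.find_spec(module) is not None
    print("%s tokenizer (%s): %s" % (language, module, "installed" if available else "missing"))
```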