Unverified Commit da20209d authored by Hiroshi Matsuda, committed by GitHub

Add sudachi_projection option to BertJapaneseTokenizer (#28503)



* add sudachi_projection option

* Upgrade sudachipy>=0.6.8

* add a test case for sudachi_projection

* Compatible with older versions of SudachiPy

* make fixup

* make style

* error message for unidic download

* revert jumanpp test cases

* format options for sudachi_projection
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* format options for sudachi_split_mode and sudachi_dict_type

* comment

* add tests for full_tokenizer kwargs

* pass projection arg directly

* require_sudachi_projection

* make style

* revert upgrade sudachipy

* check is_sudachi_projection_available()

* revert dependency_version_table and bugfix

* style format

* simply raise ImportError
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* simply raise ImportError

---------
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent b4456753
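For context, the change threads a new `sudachi_projection` value through `sudachi_kwargs` on `BertJapaneseTokenizer`. A minimal usage sketch, mirroring the tests added below (the vocab path is a placeholder; with the test vocabulary, tokenizing "これはねこです。" yields `["此れ", "は", "猫", "です", "。"]`):

```python
from transformers import BertJapaneseTokenizer

# "vocab.txt" is a placeholder path; any BertJapanese WordPiece vocab works.
# Requires sudachipy>=0.6.8 plus the sudachidict_core dictionary package.
tokenizer = BertJapaneseTokenizer(
    "vocab.txt",
    word_tokenizer_type="sudachi",
    sudachi_kwargs={"sudachi_projection": "normalized_nouns"},
)
tokens = tokenizer.tokenize("これはねこです。")
```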
src/transformers/models/bert_japanese/tokenization_bert_japanese.py

```diff
@@ -22,7 +22,7 @@ import unicodedata
 from typing import Any, Dict, List, Optional, Tuple

 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import is_sentencepiece_available, logging
+from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging


 if is_sentencepiece_available():
@@ -542,6 +542,7 @@ class SudachiTokenizer:
         sudachi_config_path=None,
         sudachi_resource_dir=None,
         sudachi_dict_type="core",
+        sudachi_projection=None,
     ):
         """
         Constructs a SudachiTokenizer.
@@ -557,11 +558,13 @@ class SudachiTokenizer:
             **trim_whitespace**: (*optional*) boolean (default False)
                 Whether to trim all whitespace, tab, newline from tokens.
             **sudachi_split_mode**: (*optional*) string
-                Split mode of sudachi, choose from "A", "B", "C".
+                Split mode of sudachi, choose from `["A", "B", "C"]`.
             **sudachi_config_path**: (*optional*) string
             **sudachi_resource_dir**: (*optional*) string
             **sudachi_dict_type**: (*optional*) string
-                dict type of sudachi, choose from "small", "core", "full".
+                dict type of sudachi, choose from `["small", "core", "full"]`.
+            **sudachi_projection**: (*optional*) string
+                Word projection mode of sudachi, choose from `["surface", "normalized", "reading", "dictionary", "dictionary_and_surface", "normalized_and_surface", "normalized_nouns"]`.
         """

         self.do_lower_case = do_lower_case
@@ -586,9 +589,17 @@ class SudachiTokenizer:
         else:
             raise ValueError("Invalid sudachi_split_mode is specified.")

-        self.sudachi = dictionary.Dictionary(
+        self.projection = sudachi_projection
+
+        sudachi_dictionary = dictionary.Dictionary(
             config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
-        ).create(self.split_mode)
+        )
+        if is_sudachi_projection_available():
+            self.sudachi = sudachi_dictionary.create(self.split_mode, projection=self.projection)
+        elif self.projection is not None:
+            raise ImportError("You need to install sudachipy>=0.6.8 to specify `projection` field in sudachi_kwargs.")
+        else:
+            self.sudachi = sudachi_dictionary.create(self.split_mode)

     def tokenize(self, text, never_split=None, **kwargs):
         """Tokenizes a piece of text."""
```
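The gist of the constructor change: the dictionary is created once, and `projection` is forwarded to `create()` only when the installed sudachipy supports it; a non-None projection on an older sudachipy fails fast with `ImportError`. A minimal sketch of driving the word tokenizer directly (assumes sudachipy>=0.6.8 and sudachidict_core are installed; the output matches the test added below):

```python
from transformers.models.bert_japanese.tokenization_bert_japanese import SudachiTokenizer

# "normalized_nouns" normalizes words that do not inflect (e.g. ねこ -> 猫,
# これ -> 此れ) while leaving inflecting words such as です on their surface form.
word_tokenizer = SudachiTokenizer(
    sudachi_dict_type="core",
    sudachi_split_mode="A",
    sudachi_projection="normalized_nouns",
)
print(word_tokenizer.tokenize("これはねこです。"))  # ["此れ", "は", "猫", "です", "。"]
```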
src/transformers/testing_utils.py

```diff
@@ -95,6 +95,7 @@ from .utils import (
     is_soundfile_availble,
     is_spacy_available,
     is_sudachi_available,
+    is_sudachi_projection_available,
     is_tensorflow_probability_available,
     is_tensorflow_text_available,
     is_tf2onnx_available,
@@ -1043,6 +1044,15 @@ def require_sudachi(test_case):
     return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case)


+def require_sudachi_projection(test_case):
+    """
+    Decorator marking a test that requires sudachi_projection
+    """
+    return unittest.skipUnless(is_sudachi_projection_available(), "test requires sudachi which supports projection")(
+        test_case
+    )
+
+
 def require_jumanpp(test_case):
     """
     Decorator marking a test that requires jumanpp
```
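A sketch of how the new decorator is meant to be applied in a test module (the test class here is hypothetical; the actual uses are in the test-file diff further down):

```python
import unittest

from transformers.testing_utils import require_sudachi_projection


class ExampleSudachiTest(unittest.TestCase):  # hypothetical test class
    @require_sudachi_projection
    def test_projection_dependent_behavior(self):
        # Skipped with "test requires sudachi which supports projection"
        # unless sudachipy>=0.6.8 is installed.
        ...
```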
src/transformers/utils/__init__.py

```diff
@@ -163,6 +163,7 @@ from .import_utils import (
     is_spacy_available,
     is_speech_available,
     is_sudachi_available,
+    is_sudachi_projection_available,
     is_tensorflow_probability_available,
     is_tensorflow_text_available,
     is_tf2onnx_available,
```
src/transformers/utils/import_utils.py

```diff
@@ -135,7 +135,7 @@ if _sklearn_available:
 _smdistributed_available = importlib.util.find_spec("smdistributed") is not None
 _soundfile_available = _is_package_available("soundfile")
 _spacy_available = _is_package_available("spacy")
-_sudachipy_available = _is_package_available("sudachipy")
+_sudachipy_available, _sudachipy_version = _is_package_available("sudachipy", return_version=True)
 _tensorflow_probability_available = _is_package_available("tensorflow_probability")
 _tensorflow_text_available = _is_package_available("tensorflow_text")
 _tf2onnx_available = _is_package_available("tf2onnx")
@@ -896,6 +896,19 @@ def is_sudachi_available():
     return _sudachipy_available


+def get_sudachi_version():
+    return _sudachipy_version
+
+
+def is_sudachi_projection_available():
+    if not is_sudachi_available():
+        return False
+
+    # NOTE: We require sudachipy>=0.6.8 to use projection option in sudachi_kwargs for the constructor of BertJapaneseTokenizer.
+    # - `projection` option is not supported in sudachipy<0.6.8, see https://github.com/WorksApplications/sudachi.rs/issues/230
+    return version.parse(_sudachipy_version) >= version.parse("0.6.8")
+
+
 def is_jumanpp_available():
     return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None)
```
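Callers can use the new helper to guard projection-dependent code paths; a minimal sketch:

```python
from transformers.utils import is_sudachi_projection_available

# Fall back to pre-0.6.8 behavior when projection is unsupported; passing a
# non-None projection to SudachiTokenizer on sudachipy<0.6.8 raises
# ImportError by design.
projection = "normalized_nouns" if is_sudachi_projection_available() else None
```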
tests/models/bert_japanese/test_tokenization_bert_japanese.py

```diff
@@ -29,7 +29,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
     SudachiTokenizer,
     WordpieceTokenizer,
 )
-from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi
+from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection

 from ...test_tokenization_common import TokenizerTesterMixin
@@ -60,6 +60,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "##、",
             "。",
             "##。",
+            "アップルストア",
+            "外国",
+            "##人",
+            "参政",
+            "##権",
+            "此れ",
+            "は",
+            "猫",
+            "です",
         ]

         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
@@ -113,6 +122,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, tokens_loaded)

+    def test_mecab_full_tokenizer_with_mecab_kwargs(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
+        )
+
+        text = "アップルストア"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["アップルストア"])
+
     def test_mecab_tokenizer_ipadic(self):
         tokenizer = MecabTokenizer(mecab_dic="ipadic")
@@ -134,6 +152,12 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_mecab_tokenizer_unidic(self):
         try:
+            import unidic
+
+            self.assertTrue(
+                os.path.isdir(unidic.DICDIR),
+                "The content of unidic was not downloaded. Run `python -m unidic download` before running this test case. Note that this requires 2.1GB on disk.",
+            )
             tokenizer = MecabTokenizer(mecab_dic="unidic")
         except ModuleNotFoundError:
             return
@@ -173,7 +197,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"],
         )

-    @require_sudachi
+    @require_sudachi_projection
     def test_pickle_sudachi_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
         self.assertIsNotNone(tokenizer)
@@ -194,7 +218,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, tokens_loaded)

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_core(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core")
@@ -205,37 +229,61 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
         # fmt: on

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_split_mode_A(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")

         self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "人", "参政", "権"])

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_split_mode_B(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")

         self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_split_mode_C(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")

         self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])

-    @require_sudachi
+    @require_sudachi_projection
+    def test_sudachi_full_tokenizer_with_sudachi_kwargs_split_mode_B(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_split_mode": "B"}
+        )
+
+        self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "##人", "参政", "##権"])
+
+    @require_sudachi_projection
+    def test_sudachi_tokenizer_projection(self):
+        tokenizer = SudachiTokenizer(
+            sudachi_dict_type="core", sudachi_split_mode="A", sudachi_projection="normalized_nouns"
+        )
+
+        self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"])
+
+    @require_sudachi_projection
+    def test_sudachi_full_tokenizer_with_sudachi_kwargs_sudachi_projection(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_projection": "normalized_nouns"}
+        )
+
+        self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"])
+
+    @require_sudachi_projection
     def test_sudachi_tokenizer_lower(self):
         tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")

         self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),[" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "])  # fmt: skip

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_no_normalize(self):
         tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")

         self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),[" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "])  # fmt: skip

-    @require_sudachi
+    @require_sudachi_projection
     def test_sudachi_tokenizer_trim_whitespace(self):
         tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
@@ -293,6 +341,17 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"],
         )

+    @require_jumanpp
+    def test_jumanpp_full_tokenizer_with_jumanpp_kwargs_trim_whitespace(self):
+        tokenizer = self.tokenizer_class(
+            self.vocab_file, word_tokenizer_type="jumanpp", jumanpp_kwargs={"trim_whitespace": True}
+        )
+
+        text = "こんにちは、世界。\nこんばんは、世界。"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
     @require_jumanpp
     def test_jumanpp_tokenizer_ext(self):
         tokenizer = JumanppTokenizer()
```