Unverified Commit 651408a0 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

[`Styling`] stylify using ruff (#27144)



* try to stylify using ruff

* might need to remove these changes?

* use ruf format andruff check

* use isinstance instead of type comparision

* use # fmt: skip

* use # fmt: skip

* nits

* soem styling changes

* update ci job

* nits isinstance

* more files update

* nits

* more nits

* small nits

* check and format

* revert wrong changes

* actually use formatter instead of checker

* nits

* well docbuilder is overwriting this commit

* revert notebook changes

* try to nuke docbuilder

* style

* fix feature exrtaction test

* remve `indent-width = 4`

* fixup

* more nits

* update the ruff version that we use

* style

* nuke docbuilder styling

* leve the print for detected changes

* nits

* Remove file I/O
Co-authored-by: default avatarcharliermarsh <charlie.r.marsh@gmail.com>

* style

* nits

* revert notebook changes

* Add # fmt skip when possible

* Add # fmt skip when possible

* Fix

* More `  # fmt: skip` usage

* More `  # fmt: skip` usage

* More `  # fmt: skip` usage

* NIts

* more fixes

* fix tapas

* Another way to skip

* Recommended way

* Fix two more fiels

* Remove asynch
Remove asynch

---------
Co-authored-by: default avatarcharliermarsh <charlie.r.marsh@gmail.com>
parent acb5b4af
...@@ -123,9 +123,7 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -123,9 +123,7 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'input_ids': [[2, 21970, 13, 5, 6092, 167, 28, 7103, 2153, 673, 8, 7028, 12051, 18, 17, 7103, 2153, 673, 8, 3515, 18684, 8, 4461, 6, 1927, 297, 8, 12060, 2607, 18, 13, 5, 4461, 15, 10538, 38, 8, 135, 15, 822, 58, 15, 993, 10363, 15, 1460, 8005, 4461, 15, 993, 255, 2328, 9, 9, 9, 6, 26, 1112, 816, 3260, 13, 5, 103, 2377, 6, 17, 1112, 816, 2782, 13, 5, 103, 10641, 6, 29, 84, 2512, 2430, 782, 18684, 2761, 19, 808, 2430, 2556, 17, 855, 1480, 9477, 4091, 128, 11712, 15, 7103, 2153, 673, 17, 24883, 9990, 9, 3], [2, 11502, 25, 1006, 20, 782, 8, 11809, 855, 1732, 19393, 18667, 37, 367, 21018, 69, 1854, 34, 11860, 19124, 27, 156, 225, 17, 193, 4141, 19, 65, 9124, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
expected_encoding = {'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'input_ids': [[2, 21970, 13, 5, 6092, 167, 28, 7103, 2153, 673, 8, 7028, 12051, 18, 17, 7103, 2153, 673, 8, 3515, 18684, 8, 4461, 6, 1927, 297, 8, 12060, 2607, 18, 13, 5, 4461, 15, 10538, 38, 8, 135, 15, 822, 58, 15, 993, 10363, 15, 1460, 8005, 4461, 15, 993, 255, 2328, 9, 9, 9, 6, 26, 1112, 816, 3260, 13, 5, 103, 2377, 6, 17, 1112, 816, 2782, 13, 5, 103, 10641, 6, 29, 84, 2512, 2430, 782, 18684, 2761, 19, 808, 2430, 2556, 17, 855, 1480, 9477, 4091, 128, 11712, 15, 7103, 2153, 673, 17, 24883, 9990, 9, 3], [2, 11502, 25, 1006, 20, 782, 8, 11809, 855, 1732, 19393, 18667, 37, 367, 21018, 69, 1854, 34, 11860, 19124, 27, 156, 225, 17, 193, 4141, 19, 65, 9124, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501
# fmt: on
self.tokenizer_integration_test_util( self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, expected_encoding=expected_encoding,
......
...@@ -574,7 +574,7 @@ class AltCLIPModelIntegrationTest(unittest.TestCase): ...@@ -574,7 +574,7 @@ class AltCLIPModelIntegrationTest(unittest.TestCase):
processor = AltCLIPProcessor.from_pretrained(model_name) processor = AltCLIPProcessor.from_pretrained(model_name)
image = prepare_img() image = prepare_img()
inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="pt").to(torch_device) inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="pt").to(torch_device) # fmt: skip
# forward pass # forward pass
with torch.no_grad(): with torch.no_grad():
......
...@@ -1029,10 +1029,8 @@ class BarkModelIntegrationTests(unittest.TestCase): ...@@ -1029,10 +1029,8 @@ class BarkModelIntegrationTests(unittest.TestCase):
def test_generate_semantic(self): def test_generate_semantic(self):
input_ids = self.inputs input_ids = self.inputs
# fmt: off
# check first ids # check first ids
expected_output_ids = [7363, 321, 41, 1461, 6915, 952, 326, 41, 41, 927,] expected_output_ids = [7363, 321, 41, 1461, 6915, 952, 326, 41, 41, 927,] # fmt: skip
# fmt: on
# greedy decoding # greedy decoding
with torch.no_grad(): with torch.no_grad():
...@@ -1049,10 +1047,8 @@ class BarkModelIntegrationTests(unittest.TestCase): ...@@ -1049,10 +1047,8 @@ class BarkModelIntegrationTests(unittest.TestCase):
input_ids = self.inputs input_ids = self.inputs
min_eos_p = 0.01 min_eos_p = 0.01
# fmt: off
# check first ids # check first ids
expected_output_ids = [7363, 321, 41, 1461, 6915, 952, 326, 41, 41, 927,] expected_output_ids = [7363, 321, 41, 1461, 6915, 952, 326, 41, 41, 927,] # fmt: skip
# fmt: on
# Should be able to read min_eos_p from kwargs # Should be able to read min_eos_p from kwargs
with torch.no_grad(): with torch.no_grad():
...@@ -1095,10 +1091,8 @@ class BarkModelIntegrationTests(unittest.TestCase): ...@@ -1095,10 +1091,8 @@ class BarkModelIntegrationTests(unittest.TestCase):
history_prompt = input_ids["history_prompt"] history_prompt = input_ids["history_prompt"]
# fmt: off
# check first ids # check first ids
expected_output_ids = [11018, 11391, 10651, 11418, 10857, 11620, 10642, 11366, 10312, 11528, 10531, 11516, 10474, 11051, 10524, 11051, ] expected_output_ids = [11018, 11391, 10651, 11418, 10857, 11620, 10642, 11366, 10312, 11528, 10531, 11516, 10474, 11051, 10524, 11051, ] # fmt: skip
# fmt: on
with torch.no_grad(): with torch.no_grad():
output_ids = self.model.semantic.generate( output_ids = self.model.semantic.generate(
......
...@@ -96,9 +96,7 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -96,9 +96,7 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[0, 490, 14328, 4507, 354, 47, 43669, 95, 25, 78117, 20215, 19779, 190, 22, 400, 4, 35343, 80310, 603, 86, 24937, 105, 33438, 94762, 196, 39642, 7, 15, 15933, 173, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 10534, 87, 25, 66, 3358, 196, 55289, 8, 82961, 81, 2204, 75203, 7, 15, 763, 12956, 216, 178, 14328, 9595, 1377, 69693, 7, 448, 71021, 196, 18106, 1437, 13974, 108, 9083, 4, 49315, 7, 39, 86, 1326, 2793, 46333, 4, 448, 196, 74588, 7, 49315, 7, 39, 21, 822, 38470, 74, 21, 66723, 62480, 8, 22050, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
expected_encoding = {'input_ids': [[0, 490, 14328, 4507, 354, 47, 43669, 95, 25, 78117, 20215, 19779, 190, 22, 400, 4, 35343, 80310, 603, 86, 24937, 105, 33438, 94762, 196, 39642, 7, 15, 15933, 173, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 10534, 87, 25, 66, 3358, 196, 55289, 8, 82961, 81, 2204, 75203, 7, 15, 763, 12956, 216, 178, 14328, 9595, 1377, 69693, 7, 448, 71021, 196, 18106, 1437, 13974, 108, 9083, 4, 49315, 7, 39, 86, 1326, 2793, 46333, 4, 448, 196, 74588, 7, 49315, 7, 39, 21, 822, 38470, 74, 21, 66723, 62480, 8, 22050, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
# fmt: on
# moussaKam/mbarthez is a french model. So we also use french texts. # moussaKam/mbarthez is a french model. So we also use french texts.
sequences = [ sequences = [
......
...@@ -233,9 +233,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -233,9 +233,7 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114], [448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
expected_encoding = {'input_ids': [[39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114], [448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501
# fmt: on
self.tokenizer_integration_test_util( self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, expected_encoding=expected_encoding,
......
...@@ -198,12 +198,12 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -198,12 +198,12 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_sudachi_tokenizer_core(self): def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core") tokenizer = SudachiTokenizer(sudachi_dict_type="core")
# fmt: off
self.assertListEqual( self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
# fmt: off [" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "],
[" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "],
# fmt: on
) )
# fmt: on
@require_sudachi @require_sudachi
def test_sudachi_tokenizer_split_mode_A(self): def test_sudachi_tokenizer_split_mode_A(self):
...@@ -227,23 +227,13 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -227,23 +227,13 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_sudachi_tokenizer_lower(self): def test_sudachi_tokenizer_lower(self):
tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core") tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
self.assertListEqual( self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),[" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "]) # fmt: skip
tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "],
# fmt: on
)
@require_sudachi @require_sudachi
def test_sudachi_tokenizer_no_normalize(self): def test_sudachi_tokenizer_no_normalize(self):
tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core") tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
self.assertListEqual( self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),[" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "]) # fmt: skip
tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
# fmt: off
[" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "],
# fmt: on
)
@require_sudachi @require_sudachi
def test_sudachi_tokenizer_trim_whitespace(self): def test_sudachi_tokenizer_trim_whitespace(self):
...@@ -280,33 +270,19 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -280,33 +270,19 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = JumanppTokenizer() tokenizer = JumanppTokenizer()
self.assertListEqual( self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),["アップル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"]) # fmt: skip
# fmt: off
["アップル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],
# fmt: on
)
@require_jumanpp @require_jumanpp
def test_jumanpp_tokenizer_lower(self): def test_jumanpp_tokenizer_lower(self):
tokenizer = JumanppTokenizer(do_lower_case=True) tokenizer = JumanppTokenizer(do_lower_case=True)
self.assertListEqual( self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),["アップル", "ストア", "で", "iphone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],) # fmt: skip
tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
# fmt: off
["アップル", "ストア", "で", "iphone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],
# fmt: on
)
@require_jumanpp @require_jumanpp
def test_jumanpp_tokenizer_no_normalize(self): def test_jumanpp_tokenizer_no_normalize(self):
tokenizer = JumanppTokenizer(normalize_text=False) tokenizer = JumanppTokenizer(normalize_text=False)
self.assertListEqual( self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),["ア", "ッ", "フ", "゚", "ル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],) # fmt: skip
tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
# fmt: off
["ア", "ッ", "フ", "゚", "ル", "ストア", "で", "iPhone", "8", "\u3000", "が", "\u3000", "\u3000", "\u3000", "発売", "さ", "れた", "\u3000", "。"],
# fmt: on
)
@require_jumanpp @require_jumanpp
def test_jumanpp_tokenizer_trim_whitespace(self): def test_jumanpp_tokenizer_trim_whitespace(self):
...@@ -327,7 +303,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -327,7 +303,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
) )
def test_wordpiece_tokenizer(self): def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"] vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"] # fmt: skip
vocab = {} vocab = {}
for i, token in enumerate(vocab_tokens): for i, token in enumerate(vocab_tokens):
...@@ -340,14 +316,14 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -340,14 +316,14 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"]) self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"])
self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) # fmt: skip
def test_sentencepiece_tokenizer(self): def test_sentencepiece_tokenizer(self):
tokenizer = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp") tokenizer = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp")
subword_tokenizer = tokenizer.subword_tokenizer subword_tokenizer = tokenizer.subword_tokenizer
tokens = subword_tokenizer.tokenize("国境 の 長い トンネル を 抜ける と 雪国 であった 。") tokens = subword_tokenizer.tokenize("国境 の 長い トンネル を 抜ける と 雪国 であった 。")
self.assertListEqual(tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "国", "▁であった", "▁。"]) self.assertListEqual(tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "国", "▁であった", "▁。"]) # fmt: skip
tokens = subword_tokenizer.tokenize("こんばんは こんばん にち は こんにちは") tokens = subword_tokenizer.tokenize("こんばんは こんばん にち は こんにちは")
self.assertListEqual(tokens, ["▁こん", "ばん", "は", "▁こん", "ばん", "▁に", "ち", "▁は", "▁こんにちは"]) self.assertListEqual(tokens, ["▁こん", "ばん", "は", "▁こん", "ばん", "▁に", "ち", "▁は", "▁こんにちは"])
...@@ -401,9 +377,7 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC ...@@ -401,9 +377,7 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character") tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")
tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。") tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。")
self.assertListEqual( self.assertListEqual(tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"]) # fmt: skip
tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"]
)
self.assertListEqual( self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12] tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12]
) )
......
...@@ -171,9 +171,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -171,9 +171,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will' 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth" " add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
) )
# fmt: off original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # fmt: skip
original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # noqa: E231
# fmt: on
self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
@require_torch @require_torch
...@@ -225,9 +223,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -225,9 +223,7 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[65, 39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114, 66], [65, 448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [65, 484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
expected_encoding = {'input_ids': [[65, 39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114, 66], [65, 448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [65, 484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501
# fmt: on
self.tokenizer_integration_test_util( self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, expected_encoding=expected_encoding,
......
...@@ -699,9 +699,7 @@ class BloomEmbeddingTest(unittest.TestCase): ...@@ -699,9 +699,7 @@ class BloomEmbeddingTest(unittest.TestCase):
}, },
} }
# fmt: off EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip
EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478]
# fmt: on
EMBEDDINGS_DS_AFTER_LN_MEAN = { EMBEDDINGS_DS_AFTER_LN_MEAN = {
3478: -6.580352783203125e-05, 3478: -6.580352783203125e-05,
...@@ -782,9 +780,7 @@ class BloomEmbeddingTest(unittest.TestCase): ...@@ -782,9 +780,7 @@ class BloomEmbeddingTest(unittest.TestCase):
) )
model.eval() model.eval()
# fmt: off EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip
EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478]
# fmt: on
MEAN_VALUE_LAST_LM = -4.3392181396484375e-05 MEAN_VALUE_LAST_LM = -4.3392181396484375e-05
MIN_MAX_DICT = {"min": -2.0625, "max": 2.75} MIN_MAX_DICT = {"min": -2.0625, "max": 2.75}
...@@ -812,9 +808,7 @@ class BloomEmbeddingTest(unittest.TestCase): ...@@ -812,9 +808,7 @@ class BloomEmbeddingTest(unittest.TestCase):
) # load in bf16 ) # load in bf16
model.eval() model.eval()
# fmt: off EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip
EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478]
# fmt: on
MEAN_LOGITS_GPU_1 = -1.823902130126953e-05 MEAN_LOGITS_GPU_1 = -1.823902130126953e-05
MEAN_LOGITS_GPU_2 = 1.9431114196777344e-05 MEAN_LOGITS_GPU_2 = 1.9431114196777344e-05
......
...@@ -119,9 +119,7 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -119,9 +119,7 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_prepare_batch_integration(self): def test_prepare_batch_integration(self):
tokenizer = self.t5_base_tokenizer tokenizer = self.t5_base_tokenizer
src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
# fmt: off expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 1, 0] # fmt: skip
expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 1, 0]
# fmt: on
batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
self.assertIsInstance(batch, BatchEncoding) self.assertIsInstance(batch, BatchEncoding)
...@@ -160,10 +158,8 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -160,10 +158,8 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = self.t5_base_tokenizer tokenizer = self.t5_base_tokenizer
src_text = ["A long paragraph for summarization. </s>"] src_text = ["A long paragraph for summarization. </s>"]
tgt_text = ["Summary of the text. </s>"] tgt_text = ["Summary of the text. </s>"]
# fmt: off expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 35, 1] # fmt: skip
expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 35, 1] expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1] # fmt: skip
expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1]
# fmt: on
batch = tokenizer(src_text, text_target=tgt_text) batch = tokenizer(src_text, text_target=tgt_text)
......
...@@ -115,9 +115,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -115,9 +115,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
# fmt: on
# camembert is a french model. So we also use french texts. # camembert is a french model. So we also use french texts.
sequences = [ sequences = [
......
...@@ -49,9 +49,7 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -49,9 +49,7 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_prepare_batch_integration(self): def test_prepare_batch_integration(self):
tokenizer = self.canine_tokenizer tokenizer = self.canine_tokenizer
src_text = ["Life is like a box of chocolates.", "You never know what you're gonna get."] src_text = ["Life is like a box of chocolates.", "You never know what you're gonna get."]
# fmt: off expected_src_tokens = [57344, 76, 105, 102, 101, 32, 105, 115, 32, 108, 105, 107, 101, 32, 97, 32, 98, 111, 120, 32, 111, 102, 32, 99, 104, 111, 99, 111, 108, 97, 116, 101, 115, 46, 57345, 0, 0, 0, 0] # fmt: skip
expected_src_tokens = [57344, 76, 105, 102, 101, 32, 105, 115, 32, 108, 105, 107, 101, 32, 97, 32, 98, 111, 120, 32, 111, 102, 32, 99, 104, 111, 99, 111, 108, 97, 116, 101, 115, 46, 57345, 0, 0, 0, 0]
# fmt: on
batch = tokenizer(src_text, padding=True, return_tensors="pt") batch = tokenizer(src_text, padding=True, return_tensors="pt")
self.assertIsInstance(batch, BatchEncoding) self.assertIsInstance(batch, BatchEncoding)
......
...@@ -714,9 +714,9 @@ class ChineseCLIPModelIntegrationTest(unittest.TestCase): ...@@ -714,9 +714,9 @@ class ChineseCLIPModelIntegrationTest(unittest.TestCase):
processor = ChineseCLIPProcessor.from_pretrained(model_name) processor = ChineseCLIPProcessor.from_pretrained(model_name)
image = prepare_img() image = prepare_img()
inputs = processor(text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="pt").to( inputs = processor(
torch_device text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="pt"
) ).to(torch_device)
# forward pass # forward pass
with torch.no_grad(): with torch.no_grad():
......
...@@ -38,9 +38,7 @@ class CLIPProcessorTest(unittest.TestCase): ...@@ -38,9 +38,7 @@ class CLIPProcessorTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.tmpdirname = tempfile.mkdtemp() self.tmpdirname = tempfile.mkdtemp()
# fmt: off vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
# fmt: on
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""] merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
self.special_tokens_map = {"unk_token": "<unk>"} self.special_tokens_map = {"unk_token": "<unk>"}
......
...@@ -36,9 +36,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -36,9 +36,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
# fmt: off vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
# fmt: on
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"] merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
self.special_tokens_map = {"unk_token": "<unk>"} self.special_tokens_map = {"unk_token": "<unk>"}
......
...@@ -38,9 +38,7 @@ class CLIPSegProcessorTest(unittest.TestCase): ...@@ -38,9 +38,7 @@ class CLIPSegProcessorTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.tmpdirname = tempfile.mkdtemp() self.tmpdirname = tempfile.mkdtemp()
# fmt: off vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
# fmt: on
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""] merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
self.special_tokens_map = {"unk_token": "<unk>"} self.special_tokens_map = {"unk_token": "<unk>"}
......
...@@ -289,9 +289,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -289,9 +289,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
# fmt: on
self.tokenizer_integration_test_util( self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, expected_encoding=expected_encoding,
...@@ -409,8 +407,8 @@ class LlamaIntegrationTest(unittest.TestCase): ...@@ -409,8 +407,8 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(rust_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test") self.assertEqual(rust_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test")
# bytefallback showcase # bytefallback showcase
self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip
self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip
self.assertEqual( self.assertEqual(
pyth_tokenizer.decode( pyth_tokenizer.decode(
[1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True
......
...@@ -252,9 +252,7 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -252,9 +252,7 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[1, 39867, 36, 19390, 486, 27, 35052, 81436, 18, 60685, 1225, 7, 35052, 81436, 18, 9367, 16899, 18, 15937, 53, 594, 773, 18, 16287, 30465, 36, 15937, 6, 41139, 38, 36979, 60763, 191, 6, 34132, 99, 6, 50538, 390, 43230, 6, 34132, 2779, 20850, 14, 699, 1072, 1194, 36, 382, 10901, 53, 7, 699, 1072, 2084, 36, 20422, 630, 53, 19, 105, 3049, 1896, 1053, 16899, 1506, 11, 37978, 4243, 7, 1237, 31869, 200, 16566, 654, 6, 35052, 81436, 7, 55630, 13593, 4, 2], [1, 26, 15011, 13, 667, 8, 1053, 18, 23611, 1237, 72356, 12820, 34, 104134, 1209, 35, 13313, 6627, 21, 202, 347, 7, 164, 2399, 11, 46, 4485, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 5, 1232, 2864, 15785, 14951, 105, 5, 8581, 1250, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
expected_encoding = {'input_ids': [[1, 39867, 36, 19390, 486, 27, 35052, 81436, 18, 60685, 1225, 7, 35052, 81436, 18, 9367, 16899, 18, 15937, 53, 594, 773, 18, 16287, 30465, 36, 15937, 6, 41139, 38, 36979, 60763, 191, 6, 34132, 99, 6, 50538, 390, 43230, 6, 34132, 2779, 20850, 14, 699, 1072, 1194, 36, 382, 10901, 53, 7, 699, 1072, 2084, 36, 20422, 630, 53, 19, 105, 3049, 1896, 1053, 16899, 1506, 11, 37978, 4243, 7, 1237, 31869, 200, 16566, 654, 6, 35052, 81436, 7, 55630, 13593, 4, 2], [1, 26, 15011, 13, 667, 8, 1053, 18, 23611, 1237, 72356, 12820, 34, 104134, 1209, 35, 13313, 6627, 21, 202, 347, 7, 164, 2399, 11, 46, 4485, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 5, 1232, 2864, 15785, 14951, 105, 5, 8581, 1250, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501
# fmt: on
self.tokenizer_integration_test_util( self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, expected_encoding=expected_encoding,
......
...@@ -474,7 +474,7 @@ class TFEncoderDecoderMixin: ...@@ -474,7 +474,7 @@ class TFEncoderDecoderMixin:
def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict):
pt_inputs_dict = {} pt_inputs_dict = {}
for name, key in tf_inputs_dict.items(): for name, key in tf_inputs_dict.items():
if type(key) == bool: if isinstance(key, bool):
pt_inputs_dict[name] = key pt_inputs_dict[name] = key
elif name == "input_values": elif name == "input_values":
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
......
...@@ -125,9 +125,7 @@ class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -125,9 +125,7 @@ class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow @slow
def test_tokenizer_integration(self): def test_tokenizer_integration(self):
# fmt: off expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
# fmt: on
self.tokenizer_integration_test_util( self.tokenizer_integration_test_util(
expected_encoding=expected_encoding, expected_encoding=expected_encoding,
......
...@@ -32,9 +32,7 @@ class ESMTokenizationTest(unittest.TestCase): ...@@ -32,9 +32,7 @@ class ESMTokenizationTest(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
self.tmpdirname = tempfile.mkdtemp() self.tmpdirname = tempfile.mkdtemp()
# fmt: off vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip
vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # noqa: E501
# fmt: on
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment