Unverified Commit 6494910f authored by Sylvain Gugger, committed by GitHub

Add sentencepiece to the CI and fix tests (#8672)

* Fix the CI and tests

* Fix quality

* Remove that m from nowhere
parent 0ad45e10
@@ -77,7 +77,7 @@ jobs:
             - v0.4-torch_and_tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,tf-cpu,torch,testing]
+      - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
       - save_cache:
           key: v0.4-{{ checksum "setup.py" }}
           paths:
@@ -103,7 +103,7 @@ jobs:
             - v0.4-torch-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,torch,testing]
+      - run: pip install .[sklearn,torch,testing,sentencepiece]
       - save_cache:
           key: v0.4-torch-{{ checksum "setup.py" }}
           paths:
@@ -129,7 +129,7 @@ jobs:
             - v0.4-tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,tf-cpu,testing]
+      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
       - save_cache:
           key: v0.4-tf-{{ checksum "setup.py" }}
           paths:
@@ -155,7 +155,7 @@ jobs:
             - v0.4-flax-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: sudo pip install .[flax,sklearn,torch,testing]
+      - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
       - save_cache:
           key: v0.4-flax-{{ checksum "setup.py" }}
           paths:
@@ -181,7 +181,7 @@ jobs:
             - v0.4-torch-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,torch,testing]
+      - run: pip install .[sklearn,torch,testing,sentencepiece]
       - save_cache:
           key: v0.4-torch-{{ checksum "setup.py" }}
           paths:
@@ -207,7 +207,7 @@ jobs:
             - v0.4-tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,tf-cpu,testing]
+      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
       - save_cache:
           key: v0.4-tf-{{ checksum "setup.py" }}
           paths:
@@ -231,7 +231,7 @@ jobs:
             - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[ja,testing]
+      - run: pip install .[ja,testing,sentencepiece]
       - run: python -m unidic download
       - save_cache:
           key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -258,7 +258,7 @@ jobs:
             - v0.4-torch_examples-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[sklearn,torch,testing]
+      - run: pip install .[sklearn,torch,sentencepiece,testing]
       - run: pip install -r examples/requirements.txt
       - save_cache:
           key: v0.4-torch_examples-{{ checksum "setup.py" }}
@@ -324,7 +324,7 @@ jobs:
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
       - run: pip install isort
-      - run: pip install .[tf,torch,flax,quality]
+      - run: pip install .[all,quality]
       - save_cache:
           key: v0.4-code_quality-{{ checksum "setup.py" }}
           paths:
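Note: every CI job that exercises tokenizers now installs the `sentencepiece` extra, since the slow (Python) tokenizers for models like MBart and T5 depend on the sentencepiece package. As a rough sketch of the availability check the test suite relies on (the real helper lives in `transformers.testing_utils`; this is an assumption about its shape, not a copy of it):

    import importlib.util

    # True when the sentencepiece package is importable, i.e. when one of the
    # .[...,sentencepiece] extras above has been installed.
    _sentencepiece_available = importlib.util.find_spec("sentencepiece") is not None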
@@ -188,7 +188,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         self.set_src_lang_special_tokens(src_lang)
         model_inputs: BatchEncoding = self(
             src_texts,
@@ -185,7 +185,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         self.set_src_lang_special_tokens(src_lang)
         model_inputs: BatchEncoding = self(
             src_texts,
@@ -309,7 +309,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         model_inputs = self(
             src_texts,
             add_special_tokens=True,
@@ -226,7 +226,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         self.prefix_tokens = []
         model_inputs = self(
             src_texts,
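All four tokenizers get the same one-line fix: `max_len` had been deprecated as an alias of `model_max_length` on the base tokenizer classes and was removed, so the `None` fallback in these methods must read `model_max_length` instead. A minimal sketch of the surviving attribute (the checkpoint name is illustrative):

    from transformers import T5Tokenizer

    tok = T5Tokenizer.from_pretrained("t5-small")
    # model_max_length is loaded with the tokenizer config and is what now
    # caps input length when a caller passes max_length=None, as in the
    # hunks above.
    print(tok.model_max_length)  # 512 for t5-small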
 import tempfile
 import unittest
 
-from transformers import (
-    SPIECE_UNDERLINE,
-    AutoTokenizer,
-    BatchEncoding,
-    MBartTokenizer,
-    MBartTokenizerFast,
-    is_torch_available,
-)
+from transformers import SPIECE_UNDERLINE, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
 from transformers.testing_utils import (
     _sentencepiece_available,
     require_sentencepiece,
@@ -138,7 +131,7 @@ class MBartEnroIntegrationTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.tokenizer: MBartTokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name)
+        cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained(cls.checkpoint_name)
         cls.pad_token_id = 1
         return cls
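The test now imports `require_sentencepiece` (and the `_sentencepiece_available` flag) from `transformers.testing_utils`, so sentencepiece-dependent tests are skipped rather than erroring when the package is absent, and it instantiates the concrete `MBartTokenizer` instead of going through `AutoTokenizer`. A hedged sketch of how such a skip decorator is typically written; the actual implementation in `testing_utils` may differ:

    import importlib.util
    import unittest

    def require_sentencepiece(test_case):
        # Skip the decorated test when sentencepiece is not installed.
        if importlib.util.find_spec("sentencepiece") is None:
            return unittest.skip("test requires sentencepiece")(test_case)
        return test_case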