Unverified Commit 13285220 authored by Arthur, committed by GitHub

[`UdopTokenizer`] Fix post merge imports (#29451)

* update

* ...

* nits

* arf

* 🧼

* beat the last guy

* style everyone
parent fa7f3cf3
src/transformers/models/udop/tokenization_udop.py
@@ -157,12 +157,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }
 
-# TODO(PVP) - this should be removed in Transformers v5
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "microsoft/udop-large": 512,
-}
-
 
 class UdopTokenizer(PreTrainedTokenizer):
     """
     Adapted from [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
@@ -256,7 +250,6 @@ class UdopTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
...
src/transformers/models/udop/tokenization_udop_fast.py
@@ -29,11 +29,6 @@ from ...tokenization_utils_base import (
 )
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
-from ..udop.tokenization_udop import (
-    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
-    PRETRAINED_VOCAB_FILES_MAP,
-    VOCAB_FILES_NAMES,
-)
 
 if is_sentencepiece_available():
@@ -42,6 +37,17 @@ else:
     UdopTokenizer = None
 
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/spiece.model",
+    },
+    "tokenizer_file": {
+        "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/tokenizer.json",
+    },
+}
 
 logger = logging.get_logger(__name__)
 
 UDOP_ENCODE_KWARGS_DOCSTRING = r"""
@@ -197,7 +203,6 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = UdopTokenizer
...
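For context, the hunk above keeps the slow tokenizer behind a sentencepiece availability guard. A minimal standalone sketch of that guard, using the absolute import path the updated test file switches to (rather than the module's own relative import):

```python
from transformers.utils import is_sentencepiece_available

if is_sentencepiece_available():
    # The slow UdopTokenizer is backed by a sentencepiece model (spiece.model).
    from transformers.models.udop.tokenization_udop import UdopTokenizer
else:
    # Without the sentencepiece backend, only the fast tokenizer is usable.
    UdopTokenizer = None
```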
tests/models/udop/test_tokenization_udop.py
@@ -22,12 +22,12 @@ from typing import List
 from transformers import (
     AddedToken,
     SpecialTokensMixin,
-    UdopTokenizer,
     UdopTokenizerFast,
     is_tf_available,
     is_torch_available,
     logging,
 )
+from transformers.models.udop.tokenization_udop import UdopTokenizer
 from transformers.testing_utils import (
     get_tests_dir,
     is_pt_tf_cross_test,
@@ -1717,6 +1717,10 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_alignement_methods(self):
         pass
 
+    @unittest.skip("#TODO will be removed in main")
+    def test_pretrained_model_lists(self):
+        pass
+
     @unittest.skip("UDOP tokenizer requires boxes besides sequences.")
     def test_maximum_encoding_length_pair_input(self):
         pass
...
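Taken together, the three diffs relocate the vocab-file constants into the fast tokenizer module, drop the deprecated `PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES` mapping, and import the slow class from its concrete module. A hedged sketch of the resulting import surface (assumes a build containing this commit; the printed dict is the `VOCAB_FILES_NAMES` added above):

```python
# The fast tokenizer stays importable from the top-level package.
from transformers import UdopTokenizerFast

# The slow class is imported from its concrete module, mirroring the
# updated test file, instead of relying on the removed re-exports.
from transformers.models.udop.tokenization_udop import UdopTokenizer

print(UdopTokenizerFast.vocab_files_names)
# {'vocab_file': 'spiece.model', 'tokenizer_file': 'tokenizer.json'}
```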