Unverified Commit 52746922 authored by Yih-Dar's avatar Yih-Dar Committed by GitHub
Browse files

Allow `# Ignore copy` (#27328)



* fix

---------
Co-authored-by: default avatarydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
parent 44b5506d
......@@ -28,7 +28,9 @@ from ...test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer,
class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Ignore copy
tokenizer_class = LongformerTokenizer
test_slow_tokenizer = True
rust_tokenizer_class = LongformerTokenizerFast
......@@ -71,23 +73,19 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_tokenizer
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_rust_tokenizer
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_full_tokenizer
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "lower newer"
......@@ -99,7 +97,6 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.roberta_dict_integration_testing with roberta->longformer
def longformer_dict_integration_testing(self):
tokenizer = self.get_tokenizer()
......@@ -110,7 +107,6 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
)
@slow
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_sequence_builders with roberta-base->allenai/longformer-base-4096
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("allenai/longformer-base-4096")
......@@ -130,7 +126,6 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
assert encoded_sentence == encoded_text_from_decode
assert encoded_pair == encoded_pair_from_decode
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_space_encoding
def test_space_encoding(self):
tokenizer = self.get_tokenizer()
......@@ -171,11 +166,9 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
self.assertNotEqual(first_char, space_encoding)
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_pretokenized_inputs
def test_pretokenized_inputs(self):
pass
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_embeded_special_tokens
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......@@ -208,7 +201,6 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_change_add_prefix_space_and_trim_offsets_args
def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
......@@ -223,7 +215,6 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)
# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments
def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
# Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` and
# `trim_offsets`
......
......@@ -95,13 +95,156 @@ class BertCopyModel(BertCopyPreTrainedModel):
"""
MOCK_DUMMY_BERT_CODE_MATCH = """
class BertDummyModel:
attr_1 = 1
attr_2 = 2
def __init__(self, a=1, b=2):
self.a = a
self.b = b
# Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
def forward(self, c):
return 1
def existing_common(self, c):
return 4
def existing_diff_to_be_ignored(self, c):
return 9
"""
MOCK_DUMMY_ROBERTA_CODE_MATCH = """
# Copied from transformers.models.dummy_bert_match.modeling_dummy_bert_match.BertDummyModel with BertDummy->RobertaBertDummy
class RobertaBertDummyModel:
attr_1 = 1
attr_2 = 2
def __init__(self, a=1, b=2):
self.a = a
self.b = b
# Ignore copy
def only_in_roberta_to_be_ignored(self, c):
return 3
# Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
def forward(self, c):
return 1
def existing_common(self, c):
return 4
# Ignore copy
def existing_diff_to_be_ignored(self, c):
return 6
"""
MOCK_DUMMY_BERT_CODE_NO_MATCH = """
class BertDummyModel:
attr_1 = 1
attr_2 = 2
def __init__(self, a=1, b=2):
self.a = a
self.b = b
# Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
def forward(self, c):
return 1
def only_in_bert(self, c):
return 7
def existing_common(self, c):
return 4
def existing_diff_not_ignored(self, c):
return 8
def existing_diff_to_be_ignored(self, c):
return 9
"""
MOCK_DUMMY_ROBERTA_CODE_NO_MATCH = """
# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
class RobertaBertDummyModel:
attr_1 = 1
attr_2 = 3
def __init__(self, a=1, b=2):
self.a = a
self.b = b
# Ignore copy
def only_in_roberta_to_be_ignored(self, c):
return 3
# Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
def forward(self, c):
return 1
def only_in_roberta_not_ignored(self, c):
return 2
def existing_common(self, c):
return 4
def existing_diff_not_ignored(self, c):
return 5
# Ignore copy
def existing_diff_to_be_ignored(self, c):
return 6
"""
EXPECTED_REPLACED_CODE = """
# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
class RobertaBertDummyModel:
attr_1 = 1
attr_2 = 2
def __init__(self, a=1, b=2):
self.a = a
self.b = b
# Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
def forward(self, c):
return 1
def only_in_bert(self, c):
return 7
def existing_common(self, c):
return 4
def existing_diff_not_ignored(self, c):
return 8
# Ignore copy
def existing_diff_to_be_ignored(self, c):
return 6
# Ignore copy
def only_in_roberta_to_be_ignored(self, c):
return 3
"""
def replace_in_file(filename, old, new):
with open(filename, "r", encoding="utf-8") as f:
content = f.read()
content = content.replace(old, new)
with open(filename, "w", encoding="utf-8") as f:
with open(filename, "w", encoding="utf-8", newline="\n") as f:
f.write(content)
......@@ -117,11 +260,18 @@ def create_tmp_repo(tmp_dir):
model_dir = tmp_dir / "src" / "transformers" / "models"
model_dir.mkdir(parents=True, exist_ok=True)
models = {"bert": MOCK_BERT_CODE, "bertcopy": MOCK_BERT_COPY_CODE}
models = {
"bert": MOCK_BERT_CODE,
"bertcopy": MOCK_BERT_COPY_CODE,
"dummy_bert_match": MOCK_DUMMY_BERT_CODE_MATCH,
"dummy_roberta_match": MOCK_DUMMY_ROBERTA_CODE_MATCH,
"dummy_bert_no_match": MOCK_DUMMY_BERT_CODE_NO_MATCH,
"dummy_roberta_no_match": MOCK_DUMMY_ROBERTA_CODE_NO_MATCH,
}
for model, code in models.items():
model_subdir = model_dir / model
model_subdir.mkdir(exist_ok=True)
with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8") as f:
with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8", newline="\n") as f:
f.write(code)
......@@ -176,11 +326,47 @@ class CopyCheckTester(unittest.TestCase):
diffs = is_copy_consistent(file_to_check)
self.assertEqual(diffs, [["models.bert.modeling_bert.BertModel", 22]])
diffs = is_copy_consistent(file_to_check, overwrite=True)
_ = is_copy_consistent(file_to_check, overwrite=True)
with open(file_to_check, "r", encoding="utf-8") as f:
self.assertEqual(f.read(), MOCK_BERT_COPY_CODE)
def test_is_copy_consistent_with_ignored_match(self):
path_to_check = ["src", "transformers", "models", "dummy_roberta_match", "modeling_dummy_roberta_match.py"]
with tempfile.TemporaryDirectory() as tmp_folder:
# Base check
create_tmp_repo(tmp_folder)
with patch_transformer_repo_path(tmp_folder):
file_to_check = os.path.join(tmp_folder, *path_to_check)
diffs = is_copy_consistent(file_to_check)
self.assertEqual(diffs, [])
def test_is_copy_consistent_with_ignored_no_match(self):
path_to_check = [
"src",
"transformers",
"models",
"dummy_roberta_no_match",
"modeling_dummy_roberta_no_match.py",
]
with tempfile.TemporaryDirectory() as tmp_folder:
# Base check with an inconsistency
create_tmp_repo(tmp_folder)
with patch_transformer_repo_path(tmp_folder):
file_to_check = os.path.join(tmp_folder, *path_to_check)
diffs = is_copy_consistent(file_to_check)
# line 6: `attr_2 = 3` in `MOCK_DUMMY_ROBERTA_CODE_NO_MATCH`.
# (which has a leading `\n`.)
self.assertEqual(
diffs, [["models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel", 6]]
)
_ = is_copy_consistent(file_to_check, overwrite=True)
with open(file_to_check, "r", encoding="utf-8") as f:
self.assertEqual(f.read(), EXPECTED_REPLACED_CODE)
def test_convert_to_localized_md(self):
localized_readme = check_copies.LOCALIZED_READMES["README_zh-hans.md"]
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment