Commit 21734901 authored by Lysandre

Copy object instead of passing the reference

parent adb8c931
@@ -326,7 +326,7 @@ class PreTrainedTokenizer(object):
             cls.pretrained_init_configuration
             and pretrained_model_name_or_path in cls.pretrained_init_configuration
         ):
-            init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
+            init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
         else:
             # Get the vocabulary from local files
             logger.info(
@@ -495,3 +495,16 @@ class TokenizerTesterMixin:
         assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
         assert [0] * padding_size + attention_mask == padded_attention_mask
         assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
+
+    def test_separate_tokenizers(self):
+        # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
+        # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing
+        # those today.
+        tokenizer = self.get_tokenizer(random_argument=True)
+        print(tokenizer.init_kwargs)
+        assert tokenizer.init_kwargs['random_argument'] is True
+        new_tokenizer = self.get_tokenizer(random_argument=False)
+        print(tokenizer.init_kwargs)
+        print(new_tokenizer.init_kwargs)
+        assert tokenizer.init_kwargs['random_argument'] is True
+        assert new_tokenizer.init_kwargs['random_argument'] is False
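
To make the motivation concrete, here is a minimal, standalone sketch of the aliasing bug the patch fixes (the `Tokenizer` class, the `"model-a"` identifier, and the `random_argument` kwarg are illustrative stand-ins, not the actual transformers code): because `pretrained_init_configuration` is a class-level dict shared by every instance, handing out the inner dict by reference lets one instance's kwargs leak into the shared defaults and into every tokenizer created afterwards.

```python
class Tokenizer:
    # Class-level defaults shared by every instance, keyed by pretrained identifier.
    pretrained_init_configuration = {"model-a": {"do_lower_case": True}}

    def __init__(self, name, **kwargs):
        # Bug: this binds a reference to the shared dict, not a private copy.
        init_configuration = Tokenizer.pretrained_init_configuration[name]
        init_configuration.update(kwargs)  # mutates the shared class-level dict
        self.init_kwargs = init_configuration


first = Tokenizer("model-a", random_argument=True)
second = Tokenizer("model-a", random_argument=False)

# Both instances (and the class defaults) now point at the same dict:
print(first.init_kwargs["random_argument"])     # False, clobbered by `second`
print(first.init_kwargs is second.init_kwargs)  # True

# Fixed version, mirroring the one-line change above: copy before mutating.
# init_configuration = Tokenizer.pretrained_init_configuration[name].copy()
```

Calling `.copy()` before mutating gives each instance its own dict, which is exactly what `test_separate_tokenizers` asserts.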