Unverified commit df5e4232 authored by Yassine, committed by GitHub

fix: create a copy for tokenizer object (#18408)

parent 24845aeb
@@ -16,6 +16,7 @@
 Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
 see tokenization_utils.py
 """
+import copy
 import json
 import os
 from collections import defaultdict
@@ -104,7 +105,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             )
         if tokenizer_object is not None:
-            fast_tokenizer = tokenizer_object
+            fast_tokenizer = copy.deepcopy(tokenizer_object)
         elif fast_tokenizer_file is not None and not from_slow:
             # We have a serialization from tokenizers which let us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
......
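With this one-line change, PreTrainedTokenizerFast wraps a private deep copy of the tokenizers.Tokenizer passed via tokenizer_object, so configuring the wrapper (truncation, padding, and similar backend settings) no longer mutates the caller's object. A minimal sketch of the behavior, where the checkpoint name and settings are purely illustrative assumptions:

# Illustrative sketch only (not part of the commit); checkpoint name is an assumption.
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

backend = Tokenizer.from_pretrained("bert-base-uncased")
wrapped = PreTrainedTokenizerFast(tokenizer_object=backend)

# Encoding with truncation configures the wrapper's internal backend tokenizer.
wrapped("a very long sentence " * 50, truncation=True, max_length=8)

# With the deep copy, the caller's tokenizer keeps its original settings;
# before this fix, the wrapper held a reference to `backend`, so its
# truncation state would have been modified by the call above.
print(backend.truncation)  # expected: None after this fix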