Unverified commit ce50305e, authored by Aymeric Augustin, committed by GitHub

Merge pull request #2270 from aaugustin/remove-python-2

Remove support for Python 2
parents b6ea0f43 1a948d70
@@ -15,11 +15,10 @@
 # limitations under the License.
 """ PyTorch XLNet model.
 """
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 import math
-import sys
 import torch
 from torch import nn
@@ -420,9 +419,7 @@ class XLNetFeedForward(nn.Module):
         self.layer_1 = nn.Linear(config.d_model, config.d_inner)
         self.layer_2 = nn.Linear(config.d_inner, config.d_model)
         self.dropout = nn.Dropout(config.dropout)
-        if isinstance(config.ff_activation, str) or (
-            sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)  # noqa: F821
-        ):
+        if isinstance(config.ff_activation, str):
             self.activation_function = ACT2FN[config.ff_activation]
         else:
             self.activation_function = config.ff_activation
......
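The second hunk above collapses a two-branch type check into a single `isinstance(config.ff_activation, str)` test, since Python 3 has no separate `unicode` type. A minimal sketch of the same dispatch, using an assumed stand-in `ACT2FN` table rather than the library's real one:

```python
import torch
import torch.nn.functional as F

ACT2FN = {"relu": F.relu, "gelu": F.gelu}  # assumed subset, for illustration only

def resolve_activation(ff_activation):
    # A config string is looked up in the table; a callable is used directly.
    if isinstance(ff_activation, str):
        return ACT2FN[ff_activation]
    return ff_activation

print(resolve_activation("relu") is F.relu)          # True
print(resolve_activation(torch.tanh) is torch.tanh)  # True
```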
@@ -14,7 +14,6 @@
 # ==============================================================================
 """Functions and classes related to optimization (weight updates)."""
-from __future__ import absolute_import, division, print_function
 import re
......
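The same `from __future__` removal recurs in every file below; on Python 3 those imports are no-ops because each feature became mandatory in 3.0. A quick standard-library check (nothing transformers-specific):

```python
import __future__

# Each removed feature reports a mandatory release of (3, 0, ...), i.e. it is
# always enabled on Python 3 and the import changes nothing.
for name in ("absolute_import", "division", "print_function", "unicode_literals"):
    print(name, getattr(__future__, name).getMandatoryRelease())
```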
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import, division, print_function, unicode_literals
 import csv
 import json
@@ -26,7 +26,6 @@ from os.path import abspath, exists
 from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
-import six
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_utils import PretrainedConfig
@@ -939,7 +938,7 @@ def pipeline(
     modelcard = config
     # Instantiate tokenizer if needed
-    if isinstance(tokenizer, six.string_types):
+    if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
     # Instantiate config if needed
......
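In `pipeline()`, `six.string_types` becomes a plain `str` check. A self-contained sketch of the string-or-object pattern, where `load_tokenizer` is a hypothetical stand-in for `AutoTokenizer.from_pretrained`:

```python
def ensure_tokenizer(tokenizer, load_tokenizer):
    if isinstance(tokenizer, str):        # e.g. "bert-base-uncased": load by name
        return load_tokenizer(tokenizer)
    return tokenizer                      # already an object: pass through unchanged

print(ensure_tokenizer("bert-base-uncased", lambda name: f"<tokenizer for {name}>"))
```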
@@ -13,15 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Tokenization classes for ALBERT model."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 import os
 import unicodedata
 from shutil import copyfile
-import six
 from .tokenization_utils import PreTrainedTokenizer
@@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return outputs
-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)
-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
         return new_pieces
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
......
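With the `six.PY2` shims gone, `_tokenize` is just the SentencePiece call: on Python 3 the library accepts and returns `str` directly, so no encode/decode step is needed. A hedged sketch of that path, assuming a local SentencePiece model file at `spiece.model` (not part of this diff):

```python
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("spiece.model")  # assumed path to an ALBERT-style SentencePiece model

def tokenize(text, sample=False):
    if not sample:
        return sp_model.EncodeAsPieces(text)             # deterministic segmentation
    return sp_model.SampleEncodeAsPieces(text, 64, 0.1)  # sampled segmentation

pieces = tokenize("Hello world")
ids = [sp_model.PieceToId(p) for p in pieces]  # pieces are already str on Python 3
print(pieces, ids)
```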
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Auto Model class. """
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
......
@@ -14,13 +14,11 @@
 # limitations under the License.
 """Tokenization classes."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import collections
 import logging
 import os
 import unicodedata
-from io import open
 from .tokenization_utils import PreTrainedTokenizer
@@ -203,11 +201,11 @@ class BertTokenizer(PreTrainedTokenizer):
         return split_tokens
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
     def convert_tokens_to_string(self, tokens):
......
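The BERT conversion methods are plain dictionary lookups with an `[UNK]` fallback. A toy round trip with a made-up vocabulary, just to show the `dict.get` pattern above:

```python
vocab = {"[UNK]": 0, "hello": 7, "world": 8}        # toy vocabulary, not BERT's
ids_to_tokens = {v: k for k, v in vocab.items()}
unk_token = "[UNK]"

def convert_token_to_id(token):
    return vocab.get(token, vocab.get(unk_token))   # unknown tokens map to [UNK]'s id

def convert_id_to_token(index):
    return ids_to_tokens.get(index, unk_token)      # unknown ids map back to [UNK]

print(convert_token_to_id("hello"), convert_token_to_id("quux"))  # 7 0
print(convert_id_to_token(8), convert_id_to_token(99))            # world [UNK]
```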
@@ -14,15 +14,12 @@
 # limitations under the License.
 """Tokenization classes."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import collections
 import logging
 import os
 import unicodedata
-import six
 from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
@@ -195,10 +192,7 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []
-        if six.PY2:
-            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
-        else:
-            mecab_output = self.mecab.parse(text)
+        mecab_output = self.mecab.parse(text)
         cursor = 0
         for line in mecab_output.split("\n"):
......
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 """ Tokenization classes for Camembert model."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 import os
@@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         elif self.sp_model.PieceToId(token) == 0:
@@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.fairseq_offset + self.sp_model.PieceToId(token)
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
......
@@ -13,12 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for Salesforce CTRL."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import logging
 import os
-from io import open
 import regex as re
@@ -204,11 +203,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return split_tokens
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)
     def convert_tokens_to_string(self, tokens):
......
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Tokenization classes for DistilBERT."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
......
@@ -13,28 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import logging
 import os
-import sys
-from io import open
+from functools import lru_cache
 import regex as re
 from .tokenization_utils import PreTrainedTokenizer
-try:
-    from functools import lru_cache
-except ImportError:
-    # Just a dummy decorator to get the checks to run on python2
-    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
-    def lru_cache():
-        return lambda func: func
 logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
@@ -80,7 +70,6 @@ def bytes_to_unicode():
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    """
-    _chr = unichr if sys.version_info[0] == 2 else chr  # noqa: F821
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
@@ -91,7 +80,7 @@ def bytes_to_unicode():
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
-    cs = [_chr(n) for n in cs]
+    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
@@ -212,23 +201,18 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            if sys.version_info[0] == 2:
-                token = "".join(
-                    self.byte_encoder[ord(b)] for b in token
-                )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
-            else:
-                token = "".join(
-                    self.byte_encoder[b] for b in token.encode("utf-8")
-                )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
         return bpe_tokens
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)
     def convert_tokens_to_string(self, tokens):
......
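The GPT-2 hunks drop the `unichr`/`ord` branches: on Python 3, `chr` covers the full Unicode range and `str.encode("utf-8")` already yields ints. For context, a sketch of the byte-to-unicode table and the surviving encoding path, reconstructed from the hunks above (consult the file itself for the canonical code):

```python
from functools import lru_cache

@lru_cache()
def bytes_to_unicode():
    # Printable bytes map to themselves; the rest are shifted into unused code points.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

byte_encoder = bytes_to_unicode()
token = " Héllo"
# One encode() call replaces the old per-character ord() loop from the PY2 branch.
mapped = "".join(byte_encoder[b] for b in token.encode("utf-8"))
print(mapped)  # every UTF-8 byte rendered as a printable unicode character
```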
@@ -13,13 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import logging
 import os
 import re
-from io import open
 from .tokenization_bert import BasicTokenizer
 from .tokenization_utils import PreTrainedTokenizer
@@ -177,7 +176,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return split_tokens
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
     def _convert_id_to_token(self, index):
......
@@ -13,22 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for RoBERTa."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 from .tokenization_gpt2 import GPT2Tokenizer
-try:
-    from functools import lru_cache
-except ImportError:
-    # Just a dummy decorator to get the checks to run on python2
-    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
-    def lru_cache():
-        return lambda func: func
 logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
......
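The RoBERTa tokenizer drops the same `lru_cache` fallback as GPT-2: `functools.lru_cache` has been in the standard library since Python 3.2, so no ImportError shim is needed. Minimal usage sketch:

```python
from functools import lru_cache

@lru_cache()                 # the decorated function is computed once, then cached
def expensive_table():
    return {b: chr(b) for b in range(2 ** 8)}

expensive_table()
expensive_table()
print(expensive_table.cache_info())  # hits=1, misses=1: second call came from the cache
```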
@@ -14,15 +14,12 @@
 # limitations under the License.
 """ Tokenization class for model T5."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 import os
 import re
 from shutil import copyfile
-import six
 from .tokenization_utils import PreTrainedTokenizer
@@ -138,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
-    def _tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, sample=False):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-        # convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            pieces = ret_pieces
         return pieces
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token.startswith("<extra_id_"):
             match = re.match(r"<extra_id_(\d+)>", token)
             num = int(match.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index < self.sp_model.get_piece_size():
             token = self.sp_model.IdToPiece(index)
         else:
             token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
         return token
     def convert_tokens_to_string(self, tokens):
......
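The T5 sentinel handling survives unchanged apart from the `six` cleanup: `<extra_id_N>` tokens are mapped to the top of the vocabulary. A toy check of that arithmetic, with a made-up `vocab_size`:

```python
import re

vocab_size = 32100  # example value, not taken from this diff

def extra_id_to_index(token):
    match = re.match(r"<extra_id_(\d+)>", token)
    num = int(match.group(1))
    return vocab_size - num - 1          # sentinel 0 gets the highest id

def index_to_extra_id(index):
    return "<extra_id_{}>".format(vocab_size - 1 - index)

print(extra_id_to_index("<extra_id_0>"))   # 32099
print(index_to_extra_id(32099))            # <extra_id_0>
```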
@@ -16,14 +16,13 @@
 """ Tokenization classes for Transformer XL model.
     Adapted from https://github.com/kimiyoung/transformer-xl.
 """
-from __future__ import absolute_import, division, print_function, unicode_literals
 import glob
 import logging
 import os
-import sys
+import pickle
 from collections import Counter, OrderedDict
-from io import open
 import numpy as np
@@ -36,11 +35,6 @@ try:
 except ImportError:
     pass
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
 logger = logging.getLogger(__name__)
@@ -238,7 +232,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.idx2sym[idx]
     def _convert_token_to_id(self, sym):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
......
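With Python 2 gone, the `cPickle` fallback collapses to a plain `import pickle`; the stdlib module handles the `Counter`/`OrderedDict` vocabulary objects this tokenizer works with. Toy round trip:

```python
import pickle
from collections import Counter, OrderedDict

counter = Counter(["the", "the", "cat"])
sym2idx = OrderedDict((sym, idx) for idx, (sym, _) in enumerate(counter.most_common()))

blob = pickle.dumps(sym2idx)           # bytes, suitable for a file opened in "wb" mode
print(pickle.loads(blob) == sym2idx)   # True
```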
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import copy
 import itertools
@@ -21,9 +21,6 @@ import json
 import logging
 import os
 import re
-from io import open
-import six
 from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
@@ -251,11 +248,9 @@ class PreTrainedTokenizer(object):
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)) and all(
-                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                    )
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 else:
-                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                    assert isinstance(value, str)
                 setattr(self, key, value)
     @classmethod
@@ -567,7 +562,7 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
-            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
+            assert isinstance(token, str)
             if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
             if (
@@ -649,12 +644,10 @@ class PreTrainedTokenizer(object):
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(
-                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                )
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 added_tokens += self.add_tokens(value)
             else:
-                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                assert isinstance(value, str)
                 added_tokens += self.add_tokens([value])
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
@@ -740,13 +733,13 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
     def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
             (resp. a sequence of ids), using the vocabulary.
         """
         if tokens is None:
             return None
-        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
+        if isinstance(tokens, str):
             return self._convert_token_to_id_with_added_voc(tokens)
         ids = []
@@ -901,9 +894,9 @@ class PreTrainedTokenizer(object):
         """
         def get_input_ids(text):
-            if isinstance(text, six.string_types):
+            if isinstance(text, str):
                 return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                 return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
@@ -1297,7 +1290,7 @@ class PreTrainedTokenizer(object):
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.
             Args:
                 skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
......
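The `get_input_ids` helper keeps its three-way dispatch, now on plain `str`: raw text is tokenized, a list of tokens is converted, and a list of ints passes through. A self-contained sketch with stand-in tokenize/convert functions (not the tokenizer's real ones):

```python
def get_input_ids(text, tokenize, convert_tokens_to_ids):
    if isinstance(text, str):
        return convert_tokens_to_ids(tokenize(text))
    elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
        return convert_tokens_to_ids(text)
    elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
        return text  # already ids
    raise ValueError("Input must be a string, a list/tuple of tokens, or a list/tuple of ids")

toy_vocab = {"hello": 1, "world": 2}
tokenize = lambda s: s.lower().split()
convert = lambda tokens: [toy_vocab.get(t, 0) for t in tokens]

print(get_input_ids("Hello world", tokenize, convert))       # [1, 2]
print(get_input_ids(["hello", "world"], tokenize, convert))  # [1, 2]
print(get_input_ids([1, 2], tokenize, convert))              # [1, 2]
```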
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for XLM."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import logging
@@ -21,7 +21,6 @@ import os
 import re
 import sys
 import unicodedata
-from io import open
 import sacremoses as sm
@@ -798,11 +797,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         return split_tokens
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)
     def convert_tokens_to_string(self, tokens):
......
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 """ Tokenization classes for XLM-RoBERTa model."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 import os
@@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         return self.sp_model.PieceToId(token) + self.fairseq_offset
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
......
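The XLM-RoBERTa mapping reserves the lowest ids for fairseq special tokens and shifts every SentencePiece id by `fairseq_offset` so the two ranges do not collide. A toy illustration of that arithmetic; every value below is made up for the example:

```python
fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
fairseq_ids_to_tokens = {v: k for k, v in fairseq_tokens_to_ids.items()}
fairseq_offset = len(fairseq_tokens_to_ids)          # 4 in this toy setup
sp_piece_to_id = {"▁Hello": 0, "▁world": 1}          # stand-in for sp_model.PieceToId
sp_id_to_piece = {v: k for k, v in sp_piece_to_id.items()}

def convert_token_to_id(token):
    if token in fairseq_tokens_to_ids:
        return fairseq_tokens_to_ids[token]
    return sp_piece_to_id[token] + fairseq_offset

def convert_id_to_token(index):
    if index in fairseq_ids_to_tokens:
        return fairseq_ids_to_tokens[index]
    return sp_id_to_piece[index - fairseq_offset]

print(convert_token_to_id("▁world"))    # 5
print(convert_id_to_token(5))           # ▁world
print(convert_id_to_token(2))           # </s>
```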
@@ -13,15 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Tokenization classes for XLNet model."""
-from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 import os
 import unicodedata
 from shutil import copyfile
-import six
 from .tokenization_utils import PreTrainedTokenizer
@@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return outputs
-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)
-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
         return new_pieces
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
        return self.sp_model.PieceToId(token)
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
......
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Finetuning the library models for task XXX."""
-from __future__ import absolute_import, division, print_function
 import argparse
 import glob
@@ -156,7 +155,7 @@ def train(args, train_dataset, model, tokenizer):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
......
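The final hunk only trims the comment: seeding still happens once per training run, right before the epoch loop. A hedged sketch of a typical `set_seed` helper for such a script (the example's real helper may differ in detail):

```python
import random

import numpy as np
import torch

def set_seed(seed, n_gpu=0):
    random.seed(seed)              # Python's own RNG
    np.random.seed(seed)           # NumPy (data shuffling, sampling)
    torch.manual_seed(seed)        # CPU and default CUDA generator
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)  # all visible GPUs

set_seed(42)
```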