Commit 8af25b16 authored by Aymeric Augustin

Remove six.

parent 6b2200fc
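Note (illustration, not part of the commit): the hunks below all follow one pattern, replacing six shims with their Python 3 equivalents. A minimal sketch of the mapping, with a made-up helper name:

# six.string_types collapses to plain str on Python 3, and the
# six.PY2 / unicode branches disappear entirely.
def is_text(value):
    return isinstance(value, str)

assert is_text("token")
assert not is_text(b"token")  # bytes must now be decoded explicitly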
@@ -18,7 +18,6 @@ from io import open
 import boto3
 import requests
-import six
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from filelock import FileLock
@@ -107,33 +106,17 @@ def is_tf_available():
     return _tf_available


-if not six.PY2:
-
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = "".join(docstr) + fn.__doc__
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = fn.__doc__ + "".join(docstr)
-            return fn
-
-        return docstring_decorator
-
-
-else:
-    # Not possible to update class docstrings on python2
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = "".join(docstr) + fn.__doc__
+        return fn
+
+    return docstring_decorator
+
+
+def add_end_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + "".join(docstr)
+        return fn
+
+    return docstring_decorator
@@ -297,7 +280,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if isinstance(user_agent, dict):
         ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
-    elif isinstance(user_agent, six.string_types):
+    elif isinstance(user_agent, str):
         ua += "; " + user_agent
     headers = {"user-agent": ua}
     if resume_size > 0:
...
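Note (illustration, not part of the diff): a minimal sketch of how the now-unconditional decorator composes docstrings on Python 3; the decorated function is made up.

def add_start_docstrings(*docstr):
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + fn.__doc__
        return fn

    return docstring_decorator


@add_start_docstrings("Shared model intro. ")
def forward(x):
    """Model-specific details."""
    return x

assert forward.__doc__ == "Shared model intro. Model-specific details."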
@@ -20,7 +20,6 @@ from os.path import expanduser
 from typing import List

 import requests
-import six
 from tqdm import tqdm
@@ -160,9 +159,6 @@ class TqdmProgressFileReader:
         self.f = f
         self.total_size = os.fstat(f.fileno()).st_size  # type: int
         self.pbar = tqdm(total=self.total_size, leave=False)
-        if six.PY3:
-            # does not work unless PY3
-            # no big deal as the CLI does not currently support PY2 anyways.
-            self.read = f.read
-            f.read = self._read
+        self.read = f.read
+        f.read = self._read
@@ -182,16 +178,7 @@ class HfFolder:
         """
         Save token, creating folder as needed.
         """
-        if six.PY3:
-            os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
-        else:
-            # Python 2
-            try:
-                os.makedirs(os.path.dirname(cls.path_token))
-            except OSError as e:
-                if e.errno != os.errno.EEXIST:
-                    raise e
-                pass
+        os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
         with open(cls.path_token, "w+") as f:
             f.write(token)
...
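Note (illustration, not part of the diff): exist_ok=True gives the same idempotent behaviour that the removed Python 2 try/except around EEXIST emulated. The path below is a throwaway stand-in for cls.path_token.

import os
import tempfile

path_token = os.path.join(tempfile.mkdtemp(), "huggingface", "token")  # stand-in path
os.makedirs(os.path.dirname(path_token), exist_ok=True)
os.makedirs(os.path.dirname(path_token), exist_ok=True)  # second call is a no-op, no OSError
with open(path_token, "w+") as f:
    f.write("dummy-token")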
@@ -26,7 +26,6 @@ from os.path import abspath, exists
 from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
-import six

 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_utils import PretrainedConfig
@@ -939,7 +938,7 @@ def pipeline(
         modelcard = config

     # Instantiate tokenizer if needed
-    if isinstance(tokenizer, six.string_types):
+    if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer)

     # Instantiate config if needed
...
@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile

-import six
-
 from .tokenization_utils import PreTrainedTokenizer
@@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')

-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return outputs

-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")

         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)

-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)

-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)

     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
...
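Note (illustration, not part of the diff): on Python 3 SentencePiece already returns str pieces, so the decode("utf-8") round-trips dropped above have nothing left to do. The model path below is a placeholder.

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spiece.model")  # placeholder: any ALBERT/XLNet-style SentencePiece model file
pieces = sp.EncodeAsPieces("Hello world")
assert all(isinstance(piece, str) for piece in pieces)
assert isinstance(sp.IdToPiece(0), str)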
@@ -202,11 +202,11 @@ class BertTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
...
@@ -20,8 +20,6 @@ import logging
 import os
 import unicodedata

-import six
-
 from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
@@ -194,9 +192,6 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []

-        if six.PY2:
-            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
-        else:
-            mecab_output = self.mecab.parse(text)
+        mecab_output = self.mecab.parse(text)

         cursor = 0
...
@@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         elif self.sp_model.PieceToId(token) == 0:
@@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.fairseq_offset + self.sp_model.PieceToId(token)

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
...
@@ -204,11 +204,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
...
@@ -224,11 +224,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return bpe_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)

     def convert_tokens_to_string(self, tokens):
...
@@ -177,7 +177,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
...
@@ -20,8 +20,6 @@ import os
 import re
 from shutil import copyfile

-import six
-
 from .tokenization_utils import PreTrainedTokenizer
@@ -137,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)

-    def _tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, sample=False):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-
-        # convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            pieces = ret_pieces
-
         return pieces

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token.startswith("<extra_id_"):
             match = re.match(r"<extra_id_(\d+)>", token)
             num = int(match.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)

-    def _convert_id_to_token(self, index, return_unicode=True):
+    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index < self.sp_model.get_piece_size():
             token = self.sp_model.IdToPiece(index)
         else:
             token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
         return token

     def convert_tokens_to_string(self, tokens):
...
@@ -238,7 +238,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.idx2sym[idx]

     def _convert_token_to_id(self, sym):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
...
@@ -23,8 +23,6 @@ import os
 import re
 from io import open

-import six
-
 from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
@@ -251,11 +249,9 @@ class PreTrainedTokenizer(object):
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)) and all(
-                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                    )
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 else:
-                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                    assert isinstance(value, str)
                 setattr(self, key, value)

     @classmethod
@@ -567,7 +563,7 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
-            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
+            assert isinstance(token, str)
             if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
             if (
@@ -649,12 +645,10 @@ class PreTrainedTokenizer(object):
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(
-                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                )
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 added_tokens += self.add_tokens(value)
             else:
-                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                assert isinstance(value, str)
                 added_tokens += self.add_tokens([value])
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
@@ -740,13 +734,13 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError

     def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
             (resp. a sequence of ids), using the vocabulary.
         """
         if tokens is None:
             return None

-        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
+        if isinstance(tokens, str):
             return self._convert_token_to_id_with_added_voc(tokens)

         ids = []
@@ -901,9 +895,9 @@ class PreTrainedTokenizer(object):
         """

         def get_input_ids(text):
-            if isinstance(text, six.string_types):
+            if isinstance(text, str):
                 return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                 return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
@@ -1297,7 +1291,7 @@ class PreTrainedTokenizer(object):
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.

             Args:
                 skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
...
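Note (illustration, not part of the diff): the same input dispatch that get_input_ids performs above, written as a standalone sketch; the helper name and return labels are made up.

def classify_text_input(text):
    # Plain str now covers everything six.string_types used to cover on Python 3.
    if isinstance(text, str):
        return "single string"
    if isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
        return "list of tokens"
    if isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
        return "list of ids"
    raise ValueError("Input must be a string, a list/tuple of strings, or a list/tuple of ints.")

assert classify_text_input("hello world") == "single string"
assert classify_text_input(["hello", "world"]) == "list of tokens"
assert classify_text_input([7592, 2088]) == "list of ids"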
@@ -798,11 +798,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
...
@@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         return self.sp_model.PieceToId(token) + self.fairseq_offset

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
...
@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile

-import six
-
 from .tokenization_utils import PreTrainedTokenizer
@@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')

-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return outputs

-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")

         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)

-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)

-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)

     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
...
@@ -145,11 +145,11 @@ class XxxTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
...
@@ -19,7 +19,6 @@ import time
 import unittest

 import requests
-import six
 from requests.exceptions import HTTPError

 from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj
@@ -50,7 +49,7 @@ class HfApiLoginTest(HfApiCommonTest):
     def test_login_valid(self):
         token = self._api.login(username=USER, password=PASS)
-        self.assertIsInstance(token, six.string_types)
+        self.assertIsInstance(token, str)


 class HfApiEndpointsTest(HfApiCommonTest):
@@ -74,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
     def test_presign_and_upload(self):
         for FILE_KEY, FILE_PATH in FILES:
             access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
-            self.assertIsInstance(access_url, six.string_types)
+            self.assertIsInstance(access_url, str)
             with open(FILE_PATH, "r") as f:
                 body = f.read()
             r = requests.get(access_url)
...
@@ -16,8 +16,6 @@

 import unittest

-import six
-
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer
@@ -34,9 +32,6 @@ class TokenizerUtilsTest(unittest.TestCase):
         self.assertIsInstance(tokenizer, PreTrainedTokenizer)

         for special_tok in tokenizer.all_special_tokens:
-            if six.PY2:
-                self.assertIsInstance(special_tok, unicode)  # noqa: F821
-            else:
-                self.assertIsInstance(special_tok, str)
+            self.assertIsInstance(special_tok, str)
             special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
             self.assertIsInstance(special_tok_id, int)
...