chenpangpang / transformers · Commits · 8af25b16
"ml/backend/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "3a9e8e9fd42f32711b8aeea355e3ed5e155d49b2"
Commit 8af25b16
Authored Dec 22, 2019 by Aymeric Augustin
Parent: 6b2200fc

    Remove six.
Changes: 19 changed files with 61 additions and 167 deletions (+61 / -167).
File                                               Additions  Deletions
src/transformers/file_utils.py                        +11        -28
src/transformers/hf_api.py                             +3        -16
src/transformers/pipelines.py                          +1         -2
src/transformers/tokenization_albert.py                +6        -28
src/transformers/tokenization_bert.py                  +2         -2
src/transformers/tokenization_bert_japanese.py         +1         -6
src/transformers/tokenization_camembert.py             +2         -2
src/transformers/tokenization_ctrl.py                  +2         -2
src/transformers/tokenization_gpt2.py                  +2         -2
src/transformers/tokenization_openai.py                +1         -1
src/transformers/tokenization_t5.py                    +4        -18
src/transformers/tokenization_transfo_xl.py            +1         -1
src/transformers/tokenization_utils.py                +10        -16
src/transformers/tokenization_xlm.py                   +2         -2
src/transformers/tokenization_xlm_roberta.py           +2         -2
src/transformers/tokenization_xlnet.py                 +6        -28
templates/adding_a_new_model/tokenization_xxx.py       +2         -2
tests/test_hf_api.py                                   +2         -3
tests/test_tokenization_utils.py                       +1         -6
src/transformers/file_utils.py

```diff
@@ -18,7 +18,6 @@ from io import open

 import boto3
 import requests
-import six
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from filelock import FileLock
@@ -107,36 +106,20 @@ def is_tf_available():
     return _tf_available


-if not six.PY2:
-
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = "".join(docstr) + fn.__doc__
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = fn.__doc__ + "".join(docstr)
-            return fn
-
-        return docstring_decorator
-
-else:
-    # Not possible to update class docstrings on python2
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = "".join(docstr) + fn.__doc__
+        return fn
+
+    return docstring_decorator
+
+
+def add_end_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + "".join(docstr)
+        return fn
+
+    return docstring_decorator


 def is_remote_url(url_or_filename):
@@ -297,7 +280,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if isinstance(user_agent, dict):
         ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
-    elif isinstance(user_agent, six.string_types):
+    elif isinstance(user_agent, str):
         ua += "; " + user_agent
     headers = {"user-agent": ua}
     if resume_size > 0:
```
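For context, the decorators kept by this change do nothing more than concatenate extra text onto a function's `__doc__` (the removed `else` branch existed only because class docstrings could not be rewritten on Python 2). A minimal, self-contained sketch of the same pattern, independent of transformers:

```python
def add_start_docstrings(*docstr):
    """Prepend the given strings to the decorated function's docstring."""
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + fn.__doc__
        return fn
    return docstring_decorator


@add_start_docstrings("Shared introduction for all model docstrings.\n\n")
def forward(x):
    """Model-specific details."""
    return x


print(forward.__doc__)
# Shared introduction for all model docstrings.
#
# Model-specific details.
```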
src/transformers/hf_api.py

```diff
@@ -20,7 +20,6 @@ from os.path import expanduser
 from typing import List

 import requests
-import six
 from tqdm import tqdm
@@ -160,11 +159,8 @@ class TqdmProgressFileReader:
         self.f = f
         self.total_size = os.fstat(f.fileno()).st_size  # type: int
         self.pbar = tqdm(total=self.total_size, leave=False)
-        if six.PY3:
-            # does not work unless PY3
-            # no big deal as the CLI does not currently support PY2 anyways.
-            self.read = f.read
-            f.read = self._read
+        self.read = f.read
+        f.read = self._read

     def _read(self, n=-1):
         self.pbar.update(n)
@@ -182,16 +178,7 @@ class HfFolder:
         """
         Save token, creating folder as needed.
         """
-        if six.PY3:
-            os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
-        else:
-            # Python 2
-            try:
-                os.makedirs(os.path.dirname(cls.path_token))
-            except OSError as e:
-                if e.errno != os.errno.EEXIST:
-                    raise e
-                pass
+        os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
         with open(cls.path_token, "w+") as f:
             f.write(token)
```
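The Python 2 fallback removed above collapses to a single call because `os.makedirs` accepts `exist_ok=True` on Python 3 (3.2+), which makes directory creation idempotent. A small sketch using a hypothetical path for illustration:

```python
import os
import tempfile

# Hypothetical token path for illustration only.
path_token = os.path.join(tempfile.gettempdir(), "hf_demo", "token")

os.makedirs(os.path.dirname(path_token), exist_ok=True)
os.makedirs(os.path.dirname(path_token), exist_ok=True)  # repeat call: no OSError/EEXIST

with open(path_token, "w+") as f:
    f.write("dummy-token")
```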
src/transformers/pipelines.py

```diff
@@ -26,7 +26,6 @@ from os.path import abspath, exists
 from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
-import six

 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_utils import PretrainedConfig
@@ -939,7 +938,7 @@ def pipeline(
             modelcard = config

     # Instantiate tokenizer if needed
-    if isinstance(tokenizer, six.string_types):
+    if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer)

     # Instantiate config if needed
```
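This is the recurring substitution throughout the commit: on Python 3, `six.string_types` is just the one-element tuple `(str,)` (on Python 2 it was `(basestring,)`), so a plain `str` check is equivalent. A tiny sketch of the equivalence being relied on:

```python
import six  # used here only to demonstrate the equivalence being removed

name = "bert-base-uncased"
assert six.string_types == (str,)  # true on Python 3
assert isinstance(name, six.string_types) == isinstance(name, str)
```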
src/transformers/tokenization_albert.py

```diff
@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile

-import six
-
 from .tokenization_utils import PreTrainedTokenizer
@@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')

-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return outputs

-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")

         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)

-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)

-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)

     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
```
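Apart from the dropped py2 decode, the accent handling above is unchanged: NFKD normalization decomposes accented characters, and the combining marks are then filtered out. A standalone sketch of that step:

```python
import unicodedata


def strip_accents(text):
    # Decompose (e.g. "é" -> "e" + combining acute accent), then drop the
    # combining marks, mirroring the keep_accents=False branch above.
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(c for c in decomposed if not unicodedata.combining(c))


print(strip_accents("Héllo, ça va?"))  # -> "Hello, ca va?"
```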
src/transformers/tokenization_bert.py

```diff
@@ -202,11 +202,11 @@ class BertTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
```
src/transformers/tokenization_bert_japanese.py

```diff
@@ -20,8 +20,6 @@ import logging
 import os
 import unicodedata

-import six
-
 from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
@@ -194,10 +192,7 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []

-        if six.PY2:
-            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
-        else:
-            mecab_output = self.mecab.parse(text)
+        mecab_output = self.mecab.parse(text)

         cursor = 0
         for line in mecab_output.split("\n"):
```
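With the Python 2 branch gone, text is handed to MeCab as `str` directly. Assuming the `mecab-python3` binding (and an installed dictionary), the usage the simplified code relies on looks like this sketch:

```python
import MeCab  # provided by the mecab-python3 package; assumed installed

tagger = MeCab.Tagger()
# On Python 3, parse() accepts and returns str, so no encode("utf-8") /
# decode("utf-8") round-trip is needed, unlike the removed six.PY2 branch.
mecab_output = tagger.parse("すもももももももものうち")
for line in mecab_output.split("\n"):
    print(line)
```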
src/transformers/tokenization_camembert.py

```diff
@@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         elif self.sp_model.PieceToId(token) == 0:
@@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.fairseq_offset + self.sp_model.PieceToId(token)

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
```
src/transformers/tokenization_ctrl.py

```diff
@@ -204,11 +204,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
```
src/transformers/tokenization_gpt2.py

```diff
@@ -224,11 +224,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return bpe_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)

     def convert_tokens_to_string(self, tokens):
```
src/transformers/tokenization_openai.py

```diff
@@ -177,7 +177,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
```
src/transformers/tokenization_t5.py

```diff
@@ -20,8 +20,6 @@ import os
 import re
 from shutil import copyfile

-import six
-
 from .tokenization_utils import PreTrainedTokenizer
@@ -137,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)

-    def _tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, sample=False):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-
-        # convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            pieces = ret_pieces
-
         return pieces

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token.startswith("<extra_id_"):
             match = re.match(r"<extra_id_(\d+)>", token)
             num = int(match.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)

-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index < self.sp_model.get_piece_size():
             token = self.sp_model.IdToPiece(index)
         else:
             token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
         return token

     def convert_tokens_to_string(self, tokens):
```
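The sentinel handling kept above maps `<extra_id_N>` to an id counted backwards from the end of the vocabulary (`vocab_size - N - 1`) and back again. A self-contained sketch of that arithmetic, using an illustrative vocabulary size rather than anything read from the diff:

```python
import re

VOCAB_SIZE = 32100  # illustrative value only


def sentinel_token_to_id(token):
    # "<extra_id_0>" gets the last id, "<extra_id_1>" the second-to-last, ...
    match = re.match(r"<extra_id_(\d+)>", token)
    num = int(match.group(1))
    return VOCAB_SIZE - num - 1


def sentinel_id_to_token(index):
    return "<extra_id_{}>".format(VOCAB_SIZE - 1 - index)


assert sentinel_token_to_id("<extra_id_0>") == 32099
assert sentinel_id_to_token(32099) == "<extra_id_0>"
assert sentinel_token_to_id("<extra_id_5>") == 32094
```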
src/transformers/tokenization_transfo_xl.py

```diff
@@ -238,7 +238,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.idx2sym[idx]

     def _convert_token_to_id(self, sym):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
```
src/transformers/tokenization_utils.py

```diff
@@ -23,8 +23,6 @@ import os
 import re
 from io import open

-import six
-
 from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
@@ -251,11 +249,9 @@ class PreTrainedTokenizer(object):
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)) and all(
-                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                    )
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 else:
-                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                    assert isinstance(value, str)
                 setattr(self, key, value)

     @classmethod
@@ -567,7 +563,7 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
-            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
+            assert isinstance(token, str)
             if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
             if (
@@ -649,12 +645,10 @@ class PreTrainedTokenizer(object):
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(
-                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                )
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 added_tokens += self.add_tokens(value)
             else:
-                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                assert isinstance(value, str)
                 added_tokens += self.add_tokens([value])
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
@@ -740,13 +734,13 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError

     def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
             (resp. a sequence of ids), using the vocabulary.
         """
         if tokens is None:
             return None

-        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
+        if isinstance(tokens, str):
             return self._convert_token_to_id_with_added_voc(tokens)

         ids = []
@@ -901,9 +895,9 @@ class PreTrainedTokenizer(object):
         """

         def get_input_ids(text):
-            if isinstance(text, six.string_types):
+            if isinstance(text, str):
                 return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                 return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
@@ -1297,7 +1291,7 @@ class PreTrainedTokenizer(object):
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.

             Args:
                 skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
```
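All of the simplified checks above now reduce to `isinstance(x, str)`. The three-way dispatch in `get_input_ids` (a string is tokenized then converted, a list of strings is converted directly, a list of ints passes through) can be sketched independently of the tokenizer machinery:

```python
def get_input_ids_sketch(text, tokenize, convert_tokens_to_ids):
    # Mirrors the post-change dispatch of the get_input_ids helper above.
    if isinstance(text, str):
        return convert_tokens_to_ids(tokenize(text))
    elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
        return convert_tokens_to_ids(text)
    elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
        return text
    raise ValueError("Input must be str, a list/tuple of str, or a list/tuple of int.")


# Toy stand-ins for the real tokenizer methods, for illustration only.
toy_vocab = {"hello": 0, "world": 1}
ids = get_input_ids_sketch(
    "hello world",
    tokenize=lambda s: s.split(),
    convert_tokens_to_ids=lambda toks: [toy_vocab[t] for t in toks],
)
assert ids == [0, 1]
```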
src/transformers/tokenization_xlm.py

```diff
@@ -798,11 +798,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
```
src/transformers/tokenization_xlm_roberta.py

```diff
@@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         return self.sp_model.PieceToId(token) + self.fairseq_offset

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
```
src/transformers/tokenization_xlnet.py

```diff
@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile

-import six
-
 from .tokenization_utils import PreTrainedTokenizer
@@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')

-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return outputs

-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")

         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
@@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)

-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)

-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)

     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
```
templates/adding_a_new_model/tokenization_xxx.py

```diff
@@ -145,11 +145,11 @@ class XxxTokenizer(PreTrainedTokenizer):
         return split_tokens

     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))

     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)

     def convert_tokens_to_string(self, tokens):
```
tests/test_hf_api.py

```diff
@@ -19,7 +19,6 @@ import time
 import unittest

 import requests
-import six
 from requests.exceptions import HTTPError

 from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj
@@ -50,7 +49,7 @@ class HfApiLoginTest(HfApiCommonTest):
     def test_login_valid(self):
         token = self._api.login(username=USER, password=PASS)
-        self.assertIsInstance(token, six.string_types)
+        self.assertIsInstance(token, str)


 class HfApiEndpointsTest(HfApiCommonTest):
@@ -74,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
     def test_presign_and_upload(self):
         for FILE_KEY, FILE_PATH in FILES:
             access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
-            self.assertIsInstance(access_url, six.string_types)
+            self.assertIsInstance(access_url, str)
             with open(FILE_PATH, "r") as f:
                 body = f.read()
             r = requests.get(access_url)
```
tests/test_tokenization_utils.py

```diff
@@ -16,8 +16,6 @@
 import unittest

-import six
-
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer
@@ -34,10 +32,7 @@ class TokenizerUtilsTest(unittest.TestCase):
         self.assertIsInstance(tokenizer, PreTrainedTokenizer)
         for special_tok in tokenizer.all_special_tokens:
-            if six.PY2:
-                self.assertIsInstance(special_tok, unicode)  # noqa: F821
-            else:
-                self.assertIsInstance(special_tok, str)
+            self.assertIsInstance(special_tok, str)
             special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
             self.assertIsInstance(special_tok_id, int)
```