chenpangpang / transformers

Commit 8af25b16, authored Dec 22, 2019 by Aymeric Augustin

    Remove six.

Parent: 6b2200fc

Showing 19 changed files with 61 additions and 167 deletions
src/transformers/file_utils.py                      +11  -28
src/transformers/hf_api.py                           +3  -16
src/transformers/pipelines.py                        +1   -2
src/transformers/tokenization_albert.py              +6  -28
src/transformers/tokenization_bert.py                +2   -2
src/transformers/tokenization_bert_japanese.py       +1   -6
src/transformers/tokenization_camembert.py           +2   -2
src/transformers/tokenization_ctrl.py                +2   -2
src/transformers/tokenization_gpt2.py                +2   -2
src/transformers/tokenization_openai.py              +1   -1
src/transformers/tokenization_t5.py                  +4  -18
src/transformers/tokenization_transfo_xl.py          +1   -1
src/transformers/tokenization_utils.py              +10  -16
src/transformers/tokenization_xlm.py                 +2   -2
src/transformers/tokenization_xlm_roberta.py         +2   -2
src/transformers/tokenization_xlnet.py               +6  -28
templates/adding_a_new_model/tokenization_xxx.py     +2   -2
tests/test_hf_api.py                                 +2   -3
tests/test_tokenization_utils.py                     +1   -6
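Note: the substitutions are the same throughout this commit: six.string_types becomes str, six.PY2 / six.PY3 branches are collapsed to the Python 3 path, and unicode / .decode("utf-8") shims become dead code. A minimal sketch of the core pattern (illustrative only, not code from the commit; the value below is hypothetical):

    # Before (Python 2 + 3 via six):
    #     if isinstance(user_agent, six.string_types):
    #         ...
    # After (Python 3 only): str is the single text type, so the check is simply:
    user_agent = "transformers/2.3.0; python/3.7.5"  # hypothetical value for illustration
    if isinstance(user_agent, str):
        print("user_agent is a plain str:", user_agent)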
src/transformers/file_utils.py

@@ -18,7 +18,6 @@ from io import open
 
 import boto3
 import requests
-import six
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from filelock import FileLock

@@ -107,33 +106,17 @@ def is_tf_available():
     return _tf_available
 
 
-if not six.PY2:
-
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = "".join(docstr) + fn.__doc__
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = fn.__doc__ + "".join(docstr)
-            return fn
-
-        return docstring_decorator
-
-
-else:
-    # Not possible to update class docstrings on python2
-
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = "".join(docstr) + fn.__doc__
+        return fn
+
+    return docstring_decorator
+
+
+def add_end_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + "".join(docstr)
+        return fn
+
+    return docstring_decorator

@@ -297,7 +280,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if isinstance(user_agent, dict):
         ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
-    elif isinstance(user_agent, six.string_types):
+    elif isinstance(user_agent, str):
         ua += "; " + user_agent
     headers = {"user-agent": ua}
     if resume_size > 0:
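The two docstring decorators survive unchanged apart from losing the six.PY2 guard. A runnable sketch of the kept behaviour, using a hypothetical decorated function to show the effect:

    def add_start_docstrings(*docstr):
        def docstring_decorator(fn):
            fn.__doc__ = "".join(docstr) + fn.__doc__
            return fn

        return docstring_decorator


    @add_start_docstrings("Shared introduction for all model classes.\n\n")
    def forward(x):
        """Model-specific details."""
        return x


    # forward.__doc__ is now the shared introduction followed by the original docstring.
    print(forward.__doc__)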
src/transformers/hf_api.py

@@ -20,7 +20,6 @@ from os.path import expanduser
 from typing import List
 
 import requests
-import six
 from tqdm import tqdm

@@ -160,9 +159,6 @@ class TqdmProgressFileReader:
         self.f = f
         self.total_size = os.fstat(f.fileno()).st_size  # type: int
         self.pbar = tqdm(total=self.total_size, leave=False)
-        if six.PY3:
-            # does not work unless PY3
-            # no big deal as the CLI does not currently support PY2 anyways.
-            self.read = f.read
-            f.read = self._read
+        self.read = f.read
+        f.read = self._read

@@ -182,16 +178,7 @@ class HfFolder:
         """
         Save token, creating folder as needed.
         """
-        if six.PY3:
-            os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
-        else:
-            # Python 2
-            try:
-                os.makedirs(os.path.dirname(cls.path_token))
-            except OSError as e:
-                if e.errno != os.errno.EEXIST:
-                    raise e
-                pass
+        os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
         with open(cls.path_token, "w+") as f:
             f.write(token)
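The HfFolder.save_token change relies on os.makedirs(..., exist_ok=True), which subsumes the old try/except-EEXIST dance. A small self-contained sketch under a temporary directory (the path here is hypothetical, not HfFolder.path_token):

    import os
    import tempfile

    path = os.path.join(tempfile.gettempdir(), "hf_demo", "token")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    os.makedirs(os.path.dirname(path), exist_ok=True)  # repeat call is a no-op instead of raising OSError
    with open(path, "w+") as f:
        f.write("dummy-token")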
src/transformers/pipelines.py

@@ -26,7 +26,6 @@ from os.path import abspath, exists
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import six
 
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_utils import PretrainedConfig

@@ -939,7 +938,7 @@ def pipeline(
         modelcard = config
 
     # Instantiate tokenizer if needed
-    if isinstance(tokenizer, six.string_types):
+    if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
 
     # Instantiate config if needed
src/transformers/tokenization_albert.py

@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile
 
-import six
-
 from .tokenization_utils import PreTrainedTokenizer

@@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
 
-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])

@@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return outputs
 
-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")
 
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)

@@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)
 
-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)
 
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
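The deleted branches in AlbertTokenizer were only reachable on Python 2, where SentencePiece could hand back byte strings. On Python 3 every piece is already str, so there is nothing to decode; a short sketch of why (stand-in values, no sentencepiece required):

    pieces = ["▁Hello", "▁world"]  # stand-ins for sp_model.EncodeAsPieces output
    assert all(isinstance(p, str) for p in pieces)
    assert not hasattr(pieces[0], "decode")          # str has no .decode() on Python 3
    print(b"\xe2\x96\x81Hello".decode("utf-8"))      # decoding is a bytes-only operation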
src/transformers/tokenization_bert.py

@@ -202,11 +202,11 @@ class BertTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
src/transformers/tokenization_bert_japanese.py

@@ -20,8 +20,6 @@ import logging
 import os
 import unicodedata
 
-import six
-
 from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab

@@ -194,9 +192,6 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []
 
-        if six.PY2:
-            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
-        else:
-            mecab_output = self.mecab.parse(text)
+        mecab_output = self.mecab.parse(text)
 
         cursor = 0
src/transformers/tokenization_camembert.py

@@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         elif self.sp_model.PieceToId(token) == 0:

@@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         return self.fairseq_offset + self.sp_model.PieceToId(token)
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
src/transformers/tokenization_ctrl.py

@@ -204,11 +204,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
src/transformers/tokenization_gpt2.py

@@ -224,11 +224,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return bpe_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)
 
     def convert_tokens_to_string(self, tokens):
src/transformers/tokenization_openai.py

@@ -177,7 +177,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
src/transformers/tokenization_t5.py

@@ -20,8 +20,6 @@ import os
 import re
 from shutil import copyfile
 
-import six
-
 from .tokenization_utils import PreTrainedTokenizer

@@ -137,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def _tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, sample=False):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)
         else:
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-
-        # convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            pieces = ret_pieces
-
         return pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token.startswith("<extra_id_"):
             match = re.match(r"<extra_id_(\d+)>", token)
             num = int(match.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)
 
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index < self.sp_model.get_piece_size():
             token = self.sp_model.IdToPiece(index)
         else:
             token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
         return token
 
     def convert_tokens_to_string(self, tokens):
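The surviving T5 logic above maps the sentinel tokens "<extra_id_N>" onto the top of the vocabulary: token N gets id vocab_size - N - 1, and the inverse recovers N from the id. A quick self-contained check of that arithmetic (vocab_size = 32100 is assumed here purely for illustration):

    import re

    vocab_size = 32100

    def extra_id_to_index(token):
        num = int(re.match(r"<extra_id_(\d+)>", token).group(1))
        return vocab_size - num - 1

    def index_to_extra_id(index):
        return "<extra_id_{}>".format(vocab_size - 1 - index)

    assert extra_id_to_index("<extra_id_0>") == 32099
    assert index_to_extra_id(32099) == "<extra_id_0>"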
src/transformers/tokenization_transfo_xl.py

@@ -238,7 +238,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return self.idx2sym[idx]
 
     def _convert_token_to_id(self, sym):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
src/transformers/tokenization_utils.py

@@ -23,8 +23,6 @@ import os
 import re
 from io import open
 
-import six
-
 from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available

@@ -251,11 +249,9 @@ class PreTrainedTokenizer(object):
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)) and all(
-                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                    )
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 else:
-                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                    assert isinstance(value, str)
                 setattr(self, key, value)
 
     @classmethod

@@ -567,7 +563,7 @@ class PreTrainedTokenizer(object):
         to_add_tokens = []
         for token in new_tokens:
-            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
+            assert isinstance(token, str)
             if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                 token = token.lower()
             if (

@@ -649,12 +645,10 @@ class PreTrainedTokenizer(object):
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(
-                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
-                )
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                 added_tokens += self.add_tokens(value)
             else:
-                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
+                assert isinstance(value, str)
                 added_tokens += self.add_tokens([value])
                 logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)

@@ -740,13 +734,13 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
 
     def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
            (resp. a sequence of ids), using the vocabulary.
        """
         if tokens is None:
             return None
 
-        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
+        if isinstance(tokens, str):
             return self._convert_token_to_id_with_added_voc(tokens)
 
         ids = []

@@ -901,9 +895,9 @@ class PreTrainedTokenizer(object):
         """
 
         def get_input_ids(text):
-            if isinstance(text, six.string_types):
+            if isinstance(text, str):
                 return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                 return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text

@@ -1297,7 +1291,7 @@ class PreTrainedTokenizer(object):
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.
 
            Args:
                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
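With six gone, get_input_ids dispatches on three plain cases: a str, a non-empty list/tuple of str, or a non-empty list/tuple of int. A toy sketch of that dispatch (the vocab and whitespace tokenizer below are stand-ins, not the real PreTrainedTokenizer machinery):

    toy_vocab = {"hello": 1, "world": 2}

    def get_input_ids(text):
        if isinstance(text, str):
            return [toy_vocab.get(t, 0) for t in text.split()]   # tokenize, then convert
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
            return [toy_vocab.get(t, 0) for t in text]           # already tokenized
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
            return list(text)                                    # already ids
        raise ValueError("Input must be a str, a list/tuple of str, or a list/tuple of int.")

    print(get_input_ids("hello world"))       # [1, 2]
    print(get_input_ids(["hello", "there"]))  # [1, 0]
    print(get_input_ids([3, 4]))              # [3, 4]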
src/transformers/tokenization_xlm.py

@@ -798,11 +798,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
src/transformers/tokenization_xlm_roberta.py

@@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
         return self.sp_model.PieceToId(token) + self.fairseq_offset
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
src/transformers/tokenization_xlnet.py

@@ -20,8 +20,6 @@ import os
 import unicodedata
 from shutil import copyfile
 
-import six
-
 from .tokenization_utils import PreTrainedTokenizer

@@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
             outputs = inputs
         outputs = outputs.replace("``", '"').replace("''", '"')
 
-        if six.PY2 and isinstance(outputs, str):
-            outputs = outputs.decode("utf-8")
-
         if not self.keep_accents:
             outputs = unicodedata.normalize("NFKD", outputs)
             outputs = "".join([c for c in outputs if not unicodedata.combining(c)])

@@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return outputs
 
-    def _tokenize(self, text, return_unicode=True, sample=False):
-        """ Tokenize a string.
-            return_unicode is used only for py2
-        """
+    def _tokenize(self, text, sample=False):
+        """ Tokenize a string. """
         text = self.preprocess_text(text)
-        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
-        if six.PY2 and isinstance(text, unicode):  # noqa: F821
-            text = text.encode("utf-8")
 
         if not sample:
             pieces = self.sp_model.EncodeAsPieces(text)

@@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
             else:
                 new_pieces.append(piece)
 
-        # note(zhiliny): convert back to unicode for py2
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in new_pieces:
-                if isinstance(piece, str):
-                    piece = piece.decode("utf-8")
-                ret_pieces.append(piece)
-            new_pieces = ret_pieces
-
         return new_pieces
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.sp_model.PieceToId(token)
 
-    def _convert_id_to_token(self, index, return_unicode=True):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        if six.PY2 and return_unicode and isinstance(token, str):
-            token = token.decode("utf-8")
-        return token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index)
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
templates/adding_a_new_model/tokenization_xxx.py

@@ -145,11 +145,11 @@ class XxxTokenizer(PreTrainedTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        """ Converts a token (str/unicode) in an id using the vocab. """
+        """ Converts a token (str) in an id using the vocab. """
         return self.vocab.get(token, self.vocab.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
 
     def convert_tokens_to_string(self, tokens):
tests/test_hf_api.py

@@ -19,7 +19,6 @@ import time
 import unittest
 
 import requests
-import six
 from requests.exceptions import HTTPError
 
 from transformers.hf_api import HfApi, HfFolder, PresignedUrl, S3Obj

@@ -50,7 +49,7 @@ class HfApiLoginTest(HfApiCommonTest):
     def test_login_valid(self):
         token = self._api.login(username=USER, password=PASS)
-        self.assertIsInstance(token, six.string_types)
+        self.assertIsInstance(token, str)
 
 
 class HfApiEndpointsTest(HfApiCommonTest):

@@ -74,7 +73,7 @@ class HfApiEndpointsTest(HfApiCommonTest):
     def test_presign_and_upload(self):
         for FILE_KEY, FILE_PATH in FILES:
             access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
-            self.assertIsInstance(access_url, six.string_types)
+            self.assertIsInstance(access_url, str)
             with open(FILE_PATH, "r") as f:
                 body = f.read()
             r = requests.get(access_url)
tests/test_tokenization_utils.py

@@ -16,8 +16,6 @@
 import unittest
 
-import six
-
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer

@@ -34,9 +32,6 @@ class TokenizerUtilsTest(unittest.TestCase):
         self.assertIsInstance(tokenizer, PreTrainedTokenizer)
         for special_tok in tokenizer.all_special_tokens:
-            if six.PY2:
-                self.assertIsInstance(special_tok, unicode)  # noqa: F821
-            else:
-                self.assertIsInstance(special_tok, str)
+            self.assertIsInstance(special_tok, str)
             special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
             self.assertIsInstance(special_tok_id, int)