Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
7b6e474c
Commit
7b6e474c
authored
Jul 26, 2019
by
thomwolf
Browse files
fix #901
parent
632d7114
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
24 additions
and
4 deletions
+24
-4
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+24
-4
No files found.
pytorch_transformers/tokenization_utils.py
View file @
7b6e474c
...
...
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
s3_models
=
list
(
cls
.
max_model_input_sizes
.
keys
())
vocab_files
=
{}
if
pretrained_model_name_or_path
in
s3_models
:
# Get the vocabulary from AWS S3 bucket
for
file_id
,
map_list
in
cls
.
pretrained_vocab_files_map
.
items
():
vocab_files
[
file_id
]
=
map_list
[
pretrained_model_name_or_path
]
else
:
# Get the vocabulary from local files
logger
.
info
(
"Model name '{}' not found in model shortcut name list ({}). "
"Assuming '{}' is a path or url to a directory containing tokenizer files."
.
format
(
pretrained_model_name_or_path
,
', '
.
join
(
s3_models
),
pretrained_model_name_or_path
))
all_vocab_files_names
=
{
'added_tokens_file'
:
ADDED_TOKENS_FILE
,
'special_tokens_map_file'
:
SPECIAL_TOKENS_MAP_FILE
}
all_vocab_files_names
.
update
(
cls
.
vocab_files_names
)
for
file_id
,
file_name
in
all_vocab_files_names
.
items
():
# Look for the tokenizer main vocabulary files
for
file_id
,
file_name
in
cls
.
vocab_files_names
.
items
():
if
os
.
path
.
isdir
(
pretrained_model_name_or_path
):
# If a directory is provided we look for the standard filenames
full_file_name
=
os
.
path
.
join
(
pretrained_model_name_or_path
,
file_name
)
else
:
# If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
full_file_name
=
pretrained_model_name_or_path
if
not
os
.
path
.
exists
(
full_file_name
):
logger
.
info
(
"Didn't find file {}. We won't load it."
.
format
(
full_file_name
))
full_file_name
=
None
vocab_files
[
file_id
]
=
full_file_name
# Look for the additional tokens files
all_vocab_files_names
=
{
'added_tokens_file'
:
ADDED_TOKENS_FILE
,
'special_tokens_map_file'
:
SPECIAL_TOKENS_MAP_FILE
}
# If a path to a file was provided, get the parent directory
saved_directory
=
pretrained_model_name_or_path
if
os
.
path
.
exists
(
saved_directory
)
and
not
os
.
path
.
isdir
(
saved_directory
):
saved_directory
=
os
.
path
.
dirname
(
saved_directory
)
for
file_id
,
file_name
in
all_vocab_files_names
.
items
():
full_file_name
=
os
.
path
.
join
(
saved_directory
,
file_name
)
if
not
os
.
path
.
exists
(
full_file_name
):
logger
.
info
(
"Didn't find file {}. We won't load it."
.
format
(
full_file_name
))
full_file_name
=
None
vocab_files
[
file_id
]
=
full_file_name
if
all
(
full_file_name
is
None
for
full_file_name
in
vocab_files
.
values
()):
logger
.
error
(
"Model name '{}' was not found in model name list ({}). "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment