chenpangpang / transformers · Commit 6b763d04
"vscode:/vscode.git/clone" did not exist on "f91ce0b8035a8cd153fcd1883efd7469e554cbd6"
Unverified commit 6b763d04, authored Jul 26, 2019 by Thomas Wolf, committed by GitHub on Jul 26, 2019

Merge pull request #911 from huggingface/small_fixes

Small fixes

Parents: c054b5ee, 7b6e474c
Changes: 2 changed files with 34 additions (+34) and 14 deletions (-14)
pytorch_transformers/__init__.py (+10 -10)
pytorch_transformers/tokenization_utils.py (+24 -4)
pytorch_transformers/__init__.py
@@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
-from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
+from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
                             BertForMaskedLM, BertForNextSentencePrediction,
                             BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
                             BertForQuestionAnswering, load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                             BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                               OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
+from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                             GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                              XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMModel,
+from .modeling_xlm import (XLMConfig, XLMPreTrainedModel, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
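The net effect of the __init__.py change is that the abstract base classes (BertPreTrainedModel, OpenAIGPTPreTrainedModel, TransfoXLPreTrainedModel, GPT2PreTrainedModel and XLMPreTrainedModel) become importable directly from the pytorch_transformers package, so custom architectures can subclass them without reaching into the modeling_* submodules. A minimal sketch of what that enables; the CustomBertClassifier class and its two-label head are illustrative, not part of this commit:

import torch.nn as nn
from pytorch_transformers import BertPreTrainedModel, BertModel

class CustomBertClassifier(BertPreTrainedModel):
    # Hypothetical custom head on top of the BERT backbone.
    def __init__(self, config):
        super(CustomBertClassifier, self).__init__(config)
        self.bert = BertModel(config)
        self.classifier = nn.Linear(config.hidden_size, 2)  # 2 labels, chosen arbitrarily
        # Weight initialization as the library's own models did at the time
        # (the init helper's name and call style may differ across versions).
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None):
        # BertModel returns (sequence_output, pooled_output, ...); use the pooled [CLS] vector.
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask)[:2]
        return self.classifier(pooled_output)

# from_pretrained() is inherited from the base class, so the custom model can load
# the pretrained backbone weights directly.
model = CustomBertClassifier.from_pretrained('bert-base-uncased')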
pytorch_transformers/tokenization_utils.py
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
             # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
             # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                     pretrained_model_name_or_path, ', '.join(s3_models),
                     pretrained_model_name_or_path))
-            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
-            all_vocab_files_names.update(cls.vocab_files_names)
-            for file_id, file_name in all_vocab_files_names.items():
+
+            # Look for the tokenizer main vocabulary files
+            for file_id, file_name in cls.vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
+                    # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
+                    # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
                     logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
+
+            # Look for the additional tokens files
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+
+            # If a path to a file was provided, get the parent directory
+            saved_directory = pretrained_model_name_or_path
+            if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
+                saved_directory = os.path.dirname(saved_directory)
+
+            for file_id, file_name in all_vocab_files_names.items():
+                full_file_name = os.path.join(saved_directory, file_name)
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+
         if all(full_file_name is None for full_file_name in vocab_files.values()):
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
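The tokenization_utils.py change splits the local-file branch of from_pretrained() into two passes: the tokenizer's main vocabulary files are resolved first (from a directory or from a single vocabulary-file path), and the optional added_tokens.json / special_tokens_map.json files are then looked up in the containing directory instead of being treated like vocabulary files. A brief usage sketch of the two loading scenarios this supports; the paths are illustrative:

from pytorch_transformers import BertTokenizer

# Case 1: a directory containing vocab.txt (and optionally added_tokens.json,
# special_tokens_map.json), e.g. one written earlier with save_pretrained().
tokenizer = BertTokenizer.from_pretrained('./my_tokenizer_dir/')

# Case 2: a path to a single vocabulary file (only works for tokenizers that use
# one vocabulary file); the additional tokens files are now searched for in the
# parent directory of that file.
tokenizer = BertTokenizer.from_pretrained('./my_tokenizer_dir/vocab.txt')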