Commit 6b763d04 (unverified) in chenpangpang/transformers
Authored by Thomas Wolf on Jul 26, 2019; committed via GitHub on Jul 26, 2019

Merge pull request #911 from huggingface/small_fixes

Small fixes

Parents: c054b5ee, 7b6e474c
Showing 2 changed files with 34 additions and 14 deletions.

pytorch_transformers/__init__.py            +10 −10
pytorch_transformers/tokenization_utils.py  +24  −4
pytorch_transformers/__init__.py
@@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)

-from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
+from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
                             BertForMaskedLM, BertForNextSentencePrediction,
                             BertForSequenceClassification, BertForMultipleChoice,
                             BertForTokenClassification, BertForQuestionAnswering,
                             load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                             BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt,
                               OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                               OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl,
                                   TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
+from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2,
                             GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)

@@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                              XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMModel,
+from .modeling_xlm import (XLMConfig, XLMPreTrainedModel, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
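Taken together, the changes to this file re-export each model family's abstract base class (BertPreTrainedModel, OpenAIGPTPreTrainedModel, TransfoXLPreTrainedModel, GPT2PreTrainedModel, XLMPreTrainedModel) from the package root. A minimal sketch of the pattern this enables, subclassing the base class without importing from the internal modeling_* modules; the task head BertForMyTask and its two-label classifier are hypothetical, not part of this commit:

import torch.nn as nn

from pytorch_transformers import BertModel, BertPreTrainedModel

class BertForMyTask(BertPreTrainedModel):
    """Hypothetical task head built on the newly re-exported base class."""

    def __init__(self, config):
        super(BertForMyTask, self).__init__(config)
        self.bert = BertModel(config)
        self.classifier = nn.Linear(config.hidden_size, 2)  # 2 labels, illustrative
        # The built-in models at this commit initialize weights this way
        # (an assumption about the base-class helper's name):
        self.apply(self.init_weights)

    def forward(self, input_ids):
        outputs = self.bert(input_ids)
        pooled_output = outputs[1]  # pooled [CLS] representation
        return self.classifier(pooled_output)

Because from_pretrained lives on the shared base class, such a head can be instantiated with pretrained weights via BertForMyTask.from_pretrained('bert-base-uncased').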
pytorch_transformers/tokenization_utils.py
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
             # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
             # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                     pretrained_model_name_or_path, ', '.join(s3_models),
                     pretrained_model_name_or_path))
-            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
-            all_vocab_files_names.update(cls.vocab_files_names)
-            for file_id, file_name in all_vocab_files_names.items():
+
+            # Look for the tokenizer main vocabulary files
+            for file_id, file_name in cls.vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
                     # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
                     logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
+
+            # Look for the additional tokens files
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+
+            # If a path to a file was provided, get the parent directory
+            saved_directory = pretrained_model_name_or_path
+            if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
+                saved_directory = os.path.dirname(saved_directory)
+
+            for file_id, file_name in all_vocab_files_names.items():
+                full_file_name = os.path.join(saved_directory, file_name)
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+
         if all(full_file_name is None for full_file_name in vocab_files.values()):
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "