chenpangpang / transformers

Commit 1c12ee0e authored Dec 20, 2019 by thomwolf

fixing xlm-roberta tokenizer max_length and automodels

parent 65c75fc5
Showing 5 changed files with 33 additions and 13 deletions (+33 -13)
transformers/modeling_auto.py            +14 -3
transformers/modeling_utils.py           +1 -1
transformers/pipelines.py                +3 -2
transformers/tokenization_utils.py       +5 -1
transformers/tokenization_xlm_roberta.py +10 -6
transformers/modeling_auto.py

@@ -20,7 +20,7 @@ import logging
 from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig,
                                  GPT2Config, OpenAIGPTConfig, RobertaConfig,
-                                 TransfoXLConfig, XLMConfig, XLNetConfig)
+                                 TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
 from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
     BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP

@@ -41,7 +41,8 @@ from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertF
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
     AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
+    XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_utils import PreTrainedModel, SequenceSummary

@@ -146,6 +147,8 @@ class AutoModel(object):
             return AlbertModel(config)
         elif isinstance(config, CamembertConfig):
             return CamembertModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaModel(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod

@@ -333,6 +336,8 @@ class AutoModelWithLMHead(object):
             return XLMWithLMHeadModel(config)
         elif isinstance(config, CTRLConfig):
             return CTRLLMHeadModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForMaskedLM(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod

@@ -509,6 +514,8 @@ class AutoModelForSequenceClassification(object):
             return XLNetForSequenceClassification(config)
         elif isinstance(config, XLMConfig):
             return XLMForSequenceClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForSequenceClassification(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod

@@ -787,6 +794,8 @@ class AutoModelForTokenClassification:
             return XLNetForTokenClassification(config)
         elif isinstance(config, RobertaConfig):
             return RobertaForTokenClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForTokenClassification(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod

@@ -865,6 +874,8 @@ class AutoModelForTokenClassification:
             return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:

@@ -873,4 +884,4 @@ class AutoModelForTokenClassification:
             return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'camembert', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
+                         "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
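With the new XLMRobertaConfig branches in place, the Auto* classes can now resolve XLM-RoBERTa checkpoints. A minimal usage sketch (checkpoint names taken from this commit; weights are downloaded on first use, and the resolution path is assumed to be the from_pretrained dispatch of this transformers version):

from transformers import AutoModel, AutoModelForTokenClassification

# Resolves to XLMRobertaModel through the new XLMRobertaConfig / 'xlm-roberta' branches.
model = AutoModel.from_pretrained('xlm-roberta-base')

# Note the ordering in the name-based dispatch above: the 'xlm-roberta' check
# must come before the 'roberta' check, because 'roberta' is a substring of
# 'xlm-roberta' and would otherwise match first.
tagger = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-large-finetuned-conll03-english')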
transformers/modeling_utils.py

@@ -415,7 +415,7 @@ class PreTrainedModel(nn.Module):
                 state_dict = torch.load(resolved_archive_file, map_location='cpu')
             except:
                 raise OSError("Unable to load weights from pytorch checkpoint file. "
-                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
+                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")

         missing_keys = []
         unexpected_keys = []
transformers/pipelines.py

@@ -49,7 +49,7 @@ logger = logging.getLogger(__name__)

 def get_framework(model=None):
     """ Select framework (TensorFlow/PyTorch) to use.
-        If both frameworks are installed and no specific model is provided, defaults to using TensorFlow.
+        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
     """
     if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
         # Both framework are available but the use supplied a model class instance.

@@ -60,7 +60,8 @@ def get_framework(model=None):
             "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
             "To install PyTorch, read the instructions at https://pytorch.org/."
         )
     else:
-        framework = 'tf' if is_tf_available() else 'pt'
+        # framework = 'tf' if is_tf_available() else 'pt'
+        framework = 'pt' if is_torch_available() else 'tf'
     return framework


 class ArgumentHandler(ABC):
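A quick sketch of what the get_framework change means in practice, assuming an environment with both TensorFlow 2.0 and PyTorch installed:

from transformers.pipelines import get_framework

# With both frameworks installed and no model instance supplied, pipelines
# now default to PyTorch ('pt') instead of TensorFlow ('tf').
print(get_framework())  # 'pt'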
transformers/tokenization_utils.py

@@ -434,7 +434,11 @@ class PreTrainedTokenizer(object):
             init_kwargs[key] = value

         # Instantiate tokenizer.
-        tokenizer = cls(*init_inputs, **init_kwargs)
+        try:
+            tokenizer = cls(*init_inputs, **init_kwargs)
+        except OSError:
+            OSError("Unable to load vocabulary from file. "
+                    "Please check that the provided vocabulary is accessible and not corrupted.")

         # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
         tokenizer.init_inputs = init_inputs
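The try/except added above wraps the internal cls(*init_inputs, **init_kwargs) call that every tokenizer from_pretrained load goes through. A minimal sketch of that call path, using a standard checkpoint name purely as an example:

from transformers import BertTokenizer

# from_pretrained resolves the vocabulary files, builds init_kwargs, and then
# instantiates the tokenizer via cls(*init_inputs, **init_kwargs) -- the call
# now guarded by the try/except OSError shown above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')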
transformers/tokenization_xlm_roberta.py

@@ -40,8 +40,12 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xlm-roberta-base': None,
-    'xlm-roberta-large': None,
+    'xlm-roberta-base': 512,
+    'xlm-roberta-large': 512,
+    'xlm-roberta-large-finetuned-conll02-dutch': 512,
+    'xlm-roberta-large-finetuned-conll02-spanish': 512,
+    'xlm-roberta-large-finetuned-conll03-english': 512,
+    'xlm-roberta-large-finetuned-conll03-german': 512,
 }

 class XLMRobertaTokenizer(PreTrainedTokenizer):

@@ -58,10 +62,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
                  **kwargs):
-        super(XLMRobertaTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token,
-                                                  unk_token=unk_token, sep_token=sep_token,
-                                                  cls_token=cls_token, pad_token=pad_token,
-                                                  mask_token=mask_token, **kwargs)
+        super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                                  unk_token=unk_token, sep_token=sep_token,
+                                                  cls_token=cls_token, pad_token=pad_token,
+                                                  mask_token=mask_token, **kwargs)
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
         self.sp_model = spm.SentencePieceProcessor()
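With max_len no longer hard-coded in __init__, the tokenizer's maximum length is taken from PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES for the known checkpoints. A minimal sketch, assuming the SentencePiece model can be downloaded and using the attribute names of this transformers version:

from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# 512 now comes from PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES rather than a
# hard-coded max_len=512 passed to the superclass constructor.
print(tokenizer.max_len)                   # 512
print(tokenizer.max_len_single_sentence)   # 510 = max_len - 2 special tokens
print(tokenizer.max_len_sentences_pair)    # 508 = max_len - 4 special tokens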