chenpangpang / transformers / Commits / cd65c41a

Unverified commit cd65c41a, authored Aug 30, 2019 by Thomas Wolf, committed by GitHub on Aug 30, 2019

Merge branch 'master' into xlm-tokenization

Parents: 69da972a, b66e9b44
Changes: 41 files
Showing 20 changed files with 1145 additions and 53 deletions (+1145 -53)
pytorch_transformers/__init__.py                         +4    -0
pytorch_transformers/modeling_auto.py                    +7    -2
pytorch_transformers/modeling_bert.py                    +3    -16
pytorch_transformers/modeling_distilbert.py              +751  -0
pytorch_transformers/modeling_gpt2.py                    +1    -12
pytorch_transformers/modeling_roberta.py                 +3    -3
pytorch_transformers/modeling_transfo_xl.py              +3    -3
pytorch_transformers/modeling_xlm.py                     +1    -1
pytorch_transformers/modeling_xlnet.py                   +14   -7
pytorch_transformers/tests/modeling_common_test.py       +7    -0
pytorch_transformers/tests/modeling_dilbert_test.py      +217  -0
pytorch_transformers/tests/tokenization_bert_test.py     +2    -2
pytorch_transformers/tests/tokenization_dilbert_test.py  +46   -0
pytorch_transformers/tokenization_bert.py                +3    -0
pytorch_transformers/tokenization_distilbert.py          +62   -0
pytorch_transformers/tokenization_gpt2.py                +2    -0
pytorch_transformers/tokenization_openai.py              +3    -0
pytorch_transformers/tokenization_roberta.py             +5    -2
pytorch_transformers/tokenization_transfo_xl.py          +4    -0
pytorch_transformers/tokenization_utils.py               +7    -5
pytorch_transformers/__init__.py

@@ -7,6 +7,7 @@ from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_utils import (PreTrainedTokenizer)
@@ -40,6 +41,9 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
                                ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel,
+                                  DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+                                  DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                              PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
pytorch_transformers/modeling_auto.py

@@ -30,6 +30,7 @@ from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
 from .modeling_xlnet import XLNetConfig, XLNetModel
 from .modeling_xlm import XLMConfig, XLMModel
 from .modeling_roberta import RobertaConfig, RobertaModel
+from .modeling_distilbert import DistilBertConfig, DistilBertModel
 from .modeling_utils import PreTrainedModel, SequenceSummary
@@ -110,7 +111,9 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
         """
-        if 'roberta' in pretrained_model_name_or_path:
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -225,7 +228,9 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
         """
-        if 'roberta' in pretrained_model_name_or_path:
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
             return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
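With this change, AutoConfig and AutoModel dispatch on a plain substring check of the model name, so the new 'distilbert' branch must come before the 'roberta'/'bert' branches ('distilbert' also contains 'bert'). A minimal usage sketch, assuming the 'distilbert-base-uncased' checkpoint name used elsewhere in this commit; imports go through the module path shown in the hunk rather than the package root:

    from pytorch_transformers.modeling_auto import AutoConfig, AutoModel

    # 'distilbert' is matched first, so these resolve to DistilBertConfig / DistilBertModel
    config = AutoConfig.from_pretrained('distilbert-base-uncased')
    model = AutoModel.from_pretrained('distilbert-base-uncased')

    # a name that only contains 'bert' still falls through to the BertModel branch
    bert = AutoModel.from_pretrained('bert-base-uncased')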
pytorch_transformers/modeling_bert.py

@@ -216,7 +216,7 @@ class BertConfig(PretrainedConfig):
             self.layer_norm_eps = layer_norm_eps
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
-                             "or the path to a pretrained model config file (str)")
+                             " or the path to a pretrained model config file (str)")
@@ -224,20 +224,7 @@ try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
 except (ImportError, AttributeError) as e:
     logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
-    class BertLayerNorm(nn.Module):
-        def __init__(self, hidden_size, eps=1e-12):
-            """Construct a layernorm module in the TF style (epsilon inside the square root).
-            """
-            super(BertLayerNorm, self).__init__()
-            self.weight = nn.Parameter(torch.ones(hidden_size))
-            self.bias = nn.Parameter(torch.zeros(hidden_size))
-            self.variance_epsilon = eps
-
-        def forward(self, x):
-            u = x.mean(-1, keepdim=True)
-            s = (x - u).pow(2).mean(-1, keepdim=True)
-            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
-            return self.weight * x + self.bias
+    BertLayerNorm = torch.nn.LayerNorm

 class BertEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings.
@@ -449,7 +436,7 @@ class BertEncoder(nn.Module):
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
             outputs = outputs + (all_attentions,)
-        return outputs  # outputs, (hidden states), (attentions)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)

 class BertPooler(nn.Module):
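The removed fallback reimplemented layer normalization by hand; aliasing BertLayerNorm to torch.nn.LayerNorm relies on the built-in computing the same thing (epsilon inside the square root, elementwise affine). A small equivalence check, as a sketch with arbitrarily chosen sizes:

    import torch
    import torch.nn as nn

    hidden_size, eps = 16, 1e-12
    x = torch.randn(2, 4, hidden_size)

    # built-in LayerNorm, as used by the new `BertLayerNorm = torch.nn.LayerNorm` alias
    ln = nn.LayerNorm(hidden_size, eps=eps)

    # the formula the removed class implemented by hand
    u = x.mean(-1, keepdim=True)
    s = (x - u).pow(2).mean(-1, keepdim=True)
    manual = (x - u) / torch.sqrt(s + eps)
    manual = ln.weight * manual + ln.bias  # freshly initialised weight=1, bias=0

    print(torch.allclose(ln(x), manual, atol=1e-6))  # expected: True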
pytorch_transformers/modeling_distilbert.py  (new file, 0 → 100644)

This diff is collapsed in the page view (751 new lines).
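The new module itself is collapsed here, but its public names appear in the __init__.py hunk above and in the test file added below. A hedged usage sketch built only from those names; the DistilBertConfig keyword names are taken from modeling_dilbert_test.py and should be treated as illustrative:

    import torch
    from pytorch_transformers.modeling_distilbert import DistilBertConfig, DistilBertForSequenceClassification

    # configuration keywords as used in the test file below
    config = DistilBertConfig(vocab_size_or_config_json_file=99, dim=32, n_layers=5, n_heads=4, hidden_dim=37)
    config.num_labels = 3

    model = DistilBertForSequenceClassification(config)
    model.eval()

    input_ids = torch.randint(0, 99, (2, 7))       # (batch_size, sequence_length)
    attention_mask = torch.ones_like(input_ids)
    logits = model(input_ids, attention_mask)[0]   # expected shape: (2, 3)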
pytorch_transformers/modeling_gpt2.py

@@ -408,10 +408,6 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding.
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -642,10 +638,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding.
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -656,14 +648,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
             All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
-        **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
+        **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
             of the input tensors. (see `input_ids` above)
-            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Language modeling loss.
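The docstring rename from **multiple_choice_labels** to **mc_labels** aligns the documentation with the keyword the double-heads model expects. A sketch of the tensor shapes described in this docstring; the exact forward signature is an assumption based on the renamed doc entry, and the model here is built from a default config rather than pretrained weights:

    import torch
    from pytorch_transformers import GPT2Config, GPT2DoubleHeadsModel

    batch_size, num_choices, seq_len = 2, 3, 5
    model = GPT2DoubleHeadsModel(GPT2Config())
    model.eval()

    input_ids = torch.randint(0, 50257, (batch_size, num_choices, seq_len))  # 50257 = default GPT-2 vocab size
    mc_labels = torch.randint(0, num_choices, (batch_size,))  # shape (batch_size,), as documented

    # keyword follows the renamed doc entry; older docs called it multiple_choice_labels
    outputs = model(input_ids, mc_labels=mc_labels)
    mc_loss = outputs[0]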
pytorch_transformers/modeling_roberta.py

@@ -98,15 +98,15 @@ ROBERTA_INPUTS_DOCSTRING = r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
-            To match pre-training, RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+            To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:

             (a) For sequence pairs:

-                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP][SEP] no it is not . [SEP]``
+                ``tokens:         <s> Is this Jacksonville ? </s> </s> No it is not . </s>``

             (b) For single sequences:

-                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                ``tokens:         <s> the dog is hairy . </s>``

             Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
             the ``add_special_tokens`` parameter set to ``True``.
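The corrected docstring describes RoBERTa's own special tokens rather than BERT's. A quick sketch of how the formatted sequence is obtained via the RobertaTokenizer.encode path the docstring points to; the checkpoint name 'roberta-base' is an assumption:

    from pytorch_transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # add_special_tokens=True wraps the ids in <s> ... </s> as described above
    ids = tokenizer.encode("the dog is hairy .", add_special_tokens=True)
    print(tokenizer.convert_ids_to_tokens(ids))  # expected: ['<s>', ..., '</s>']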
pytorch_transformers/modeling_transfo_xl.py

@@ -285,7 +285,7 @@ class TransfoXLConfig(PretrainedConfig):
             self.init_std = init_std
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
-                             "or the path to a pretrained model config file (str)")
+                             " or the path to a pretrained model config file (str)")

     @property
     def max_position_embeddings(self):
@@ -1142,10 +1142,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             else:
                 mask_shift_len = qlen
             dec_attn_mask = (torch.triu(all_ones, 1+mlen)
-                    + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1
+                    + torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None] # -1
         else:
             dec_attn_mask = torch.triu(
-                word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None]
+                word_emb.new_ones(qlen, klen), diagonal=1+mlen).bool()[:,:,None]

         hids = []
         attentions = []
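Switching the attention mask from .byte() to .bool() follows newer PyTorch, where masked operations on uint8 tensors are deprecated in favour of boolean tensors. A standalone sketch of the mask construction in the changed lines, with sizes picked arbitrarily:

    import torch

    qlen, mlen = 4, 2
    klen = qlen + mlen
    all_ones = torch.ones(qlen, klen)
    mask_shift_len = qlen

    # same triu/tril combination as the changed lines, now yielding torch.bool instead of torch.uint8
    dec_attn_mask = (torch.triu(all_ones, 1 + mlen)
                     + torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None]
    print(dec_attn_mask.dtype)  # torch.bool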
pytorch_transformers/modeling_xlm.py

@@ -184,7 +184,7 @@ class XLMConfig(PretrainedConfig):
             self.end_n_top = end_n_top
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
-                             "or the path to a pretrained model config file (str)")
+                             " or the path to a pretrained model config file (str)")

     @property
     def vocab_size(self):
pytorch_transformers/modeling_xlnet.py

@@ -306,7 +306,7 @@ class XLNetConfig(PretrainedConfig):
             self.end_n_top = end_n_top
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
-                             "or the path to a pretrained model config file (str)")
+                             " or the path to a pretrained model config file (str)")

     @property
     def max_position_embeddings(self):
@@ -677,8 +677,11 @@ XLNET_INPUTS_DOCSTRING = r"""
             ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **mems**: (`optional`)
             list of ``torch.FloatTensor`` (one for each layer):
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
             (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+            To activate mems you need to set up config.mem_len to a positive value which will be the max number of tokens in
+            the memory output by the model. E.g. `model = XLNetModel.from_pretrained('xlnet-base-case, mem_len=1024)` will
+            instantiate a model which can use up to 1024 tokens of memory (in addition to the input it self).
         **perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
             Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
             If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
@@ -705,7 +708,8 @@ class XLNetModel(XLNetPreTrainedModel):
         **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
             list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
             of shape ``(batch_size, sequence_length, hidden_size)``:
@@ -859,7 +863,7 @@ class XLNetModel(XLNetPreTrainedModel):
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None

         qlen, bsz = input_ids.shape[0], input_ids.shape[1]
-        mlen = mems[0].shape[0] if mems is not None else 0
+        mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
         klen = mlen + qlen

         dtype_float = next(self.parameters()).dtype
@@ -1011,7 +1015,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
             list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
             of shape ``(batch_size, sequence_length, hidden_size)``:
@@ -1091,7 +1096,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
             list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
             of shape ``(batch_size, sequence_length, hidden_size)``:
@@ -1189,7 +1195,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
             list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
             of shape ``(batch_size, sequence_length, hidden_size)``:
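The expanded docstrings explain that mems are only returned when config.mem_len is positive, and the extra `and mems[0] is not None` guard keeps the forward pass working when a tuple of None is fed back in. A hedged usage sketch following the example added to the docstring (note the docstring's own snippet drops the closing quote of the checkpoint name); checkpoint names and tensor contents here are for illustration only:

    import torch
    from pytorch_transformers import XLNetModel, XLNetTokenizer

    # as suggested by the new docstring: a positive mem_len activates the memory
    model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
    outputs = model(input_ids)
    last_hidden_state, mems = outputs[0], outputs[1]  # mems: one tensor per layer when mem_len > 0

    # feeding mems back speeds up processing of the next segment and extends the context
    next_ids = torch.tensor([tokenizer.encode("and so is my cat")])
    outputs = model(next_ids, mems=mems)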
pytorch_transformers/tests/modeling_common_test.py

@@ -49,6 +49,7 @@ class CommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        test_head_masking = True

         def test_initialization(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -159,6 +160,9 @@ class CommonTestCases:
         def test_headmasking(self):
+            if not self.test_head_masking:
+                return
+
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             config.output_attentions = True
@@ -282,6 +286,9 @@ class CommonTestCases:
             self.assertTrue(models_equal)

         def test_tie_model_weights(self):
+            if not self.test_torchscript:
+                return
+
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

             def check_same_values(layer_1, layer_2):
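The new class-level flags let a model's test class opt out of the shared head-masking and weight-tying checks instead of failing on unsupported features. A minimal sketch of how a subclass would use them; the class name here is hypothetical and mirrors the pattern of the repo's test modules:

    from .modeling_common_test import CommonTestCases

    class SomeModelTest(CommonTestCases.CommonModelTester):
        # hypothetical subclass: skip the common head-masking test, keep the rest at defaults
        test_head_masking = False
        test_torchscript = True  # after this change, also gates test_tie_model_weights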
pytorch_transformers/tests/modeling_dilbert_test.py  (new file, 0 → 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest

from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                                  DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP

from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)


class DistilBertModelTest(CommonTestCases.CommonModelTester):

    all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
                         DistilBertForSequenceClassification)
    test_pruning = True
    test_torchscript = True
    test_resize_embeddings = True
    test_head_masking = True

    class DistilBertModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=False,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = DistilBertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
                hidden_dim=self.intermediate_size,
                hidden_act=self.hidden_act,
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range)

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = DistilBertModel(config=config)
            model.eval()
            (sequence_output,) = model(input_ids, input_mask)
            (sequence_output,) = model(input_ids)

            result = {
                "sequence_output": sequence_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = DistilBertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = DistilBertForQuestionAnswering(config=config)
            model.eval()
            loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = DistilBertForSequenceClassification(config)
            model.eval()
            loss, logits = model(input_ids, input_mask, sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = DistilBertModelTest.DistilBertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_distilbert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)

    # @pytest.mark.slow
    # def test_model_from_pretrained(self):
    #     cache_dir = "/tmp/pytorch_transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
    #         shutil.rmtree(cache_dir)
    #         self.assertIsNotNone(model)

if __name__ == "__main__":
    unittest.main()
pytorch_transformers/tests/tokenization_bert_test.py

@@ -50,7 +50,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return input_text, output_text

     def test_full_tokenizer(self):
-        tokenizer = BertTokenizer(self.vocab_file)
+        tokenizer = self.tokenizer_class(self.vocab_file)

         tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
         self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertFalse(_is_punctuation(u" "))

     def test_sequence_builders(self):
-        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
pytorch_transformers/tests/tokenization_dilbert_test.py  (new file, 0 → 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
from io import open

from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer)

from .tokenization_tests_commons import CommonTestCases
from .tokenization_bert_test import BertTokenizationTest


class DistilBertTokenizationTest(BertTokenizationTest):

    tokenizer_class = DistilBertTokenizer

    def get_tokenizer(self):
        return DistilBertTokenizer.from_pretrained(self.tmpdirname)

    def test_sequence_builders(self):
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]


if __name__ == '__main__':
    unittest.main()
pytorch_transformers/tokenization_bert.py

@@ -143,6 +143,9 @@ class BertTokenizer(PreTrainedTokenizer):
         super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
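The two new attributes record how many content tokens fit once special tokens are reserved: a single BERT sequence spends two slots ([CLS], [SEP]) and a pair spends three ([CLS], [SEP], [SEP]). The later hunks apply the same idea with different counts (GPT-2, OpenAI GPT and Transformer-XL reserve none; RoBERTa reserves 2 and 4). A small sketch of the arithmetic, numbers only:

    max_len = 512                            # typical BERT max_len
    max_len_single_sentence = max_len - 2    # [CLS] tokens ... [SEP]
    max_len_sentences_pair = max_len - 3     # [CLS] seq A [SEP] seq B [SEP]
    print(max_len_single_sentence, max_len_sentences_pair)  # 510 509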
pytorch_transformers/tokenization_distilbert.py  (new file, 0 → 100644)

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""
from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import unicodedata
from io import open

from .tokenization_bert import BertTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'distilbert-base-uncased': 512,
    'distilbert-base-uncased-distilled-squad': 512,
}


class DistilBertTokenizer(BertTokenizer):
    r"""
    Constructs a DistilBertTokenizer.
    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_wordpiece_only=False
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
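DistilBertTokenizer subclasses BertTokenizer and only swaps in DistilBERT's pretrained vocabulary maps, so its behaviour is the same wordpiece pipeline. A short usage sketch; the checkpoint name comes from the map above:

    from pytorch_transformers import DistilBertTokenizer

    # same vocabulary format and wordpiece behaviour as BertTokenizer, only the pretrained maps differ
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    ids = tokenizer.encode("sequence builders")
    print(tokenizer.convert_ids_to_tokens(ids))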
pytorch_transformers/tokenization_gpt2.py

@@ -108,6 +108,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                  bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
         super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

         self.encoder = json.load(open(vocab_file))
         self.decoder = {v: k for k, v in self.encoder.items()}
pytorch_transformers/tokenization_openai.py

@@ -87,6 +87,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
+
         try:
             import ftfy
             from spacy.lang.en import English
pytorch_transformers/tokenization_roberta.py

@@ -77,6 +77,9 @@ class RobertaTokenizer(PreTrainedTokenizer):
                                                sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                                mask_token=mask_token, **kwargs)

+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
+
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
@@ -163,14 +166,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
-        A RoBERTa sequence has the following format: [CLS] X [SEP]
+        A RoBERTa sequence has the following format: <s> X </s>
         """
         return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

     def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
-        A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP]
+        A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
         """
         sep = [self._convert_token_to_id(self.sep_token)]
         cls = [self._convert_token_to_id(self.cls_token)]
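The corrected docstrings now describe the id patterns these builders already produced: <s> X </s> for a single sequence and <s> A </s></s> B </s> for a pair. A sketch mirroring the DistilBERT tokenization test added above, but for RoBERTa; the 'roberta-base' checkpoint name is assumed:

    from pytorch_transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    text = tokenizer.encode("sequence builders")
    text_2 = tokenizer.encode("multi-sequence build")

    cls_id = tokenizer._convert_token_to_id(tokenizer.cls_token)  # id of <s>
    sep_id = tokenizer._convert_token_to_id(tokenizer.sep_token)  # id of </s>

    assert tokenizer.add_special_tokens_single_sentence(text) == [cls_id] + text + [sep_id]
    assert tokenizer.add_special_tokens_sentences_pair(text, text_2) == \
        [cls_id] + text + [sep_id] + [sep_id] + text_2 + [sep_id]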
pytorch_transformers/tokenization_transfo_xl.py

@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
                                                  additional_special_tokens=additional_special_tokens,
                                                  **kwargs)

+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
+
         if never_split is None:
             never_split = self.all_special_tokens
         if special is None:
pytorch_transformers/tokenization_utils.py

@@ -638,10 +638,12 @@ class PreTrainedTokenizer(object):
         return first_sentence_tokens, second_sentence_tokens

     def add_special_tokens_single_sentence(self, token_ids):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
+        return token_ids

     def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
+        return token_ids_0 + token_ids_1

     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
@@ -684,9 +686,9 @@ class PreTrainedTokenizer(object):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)

-        if self.sep_token is not None and self.sep_token in text:
-            text = text.replace(self.cls_token, self.sep_token)
-            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
+        if self._sep_token is not None and self._sep_token in text:
+            text = text.replace(self._cls_token, self._sep_token)
+            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
             if clean_up_tokenization_spaces:
                 clean_text = [self.clean_up_tokenization(text) for text in split_text]
             return clean_text
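Replacing the NotImplementedError with a warning makes the base class degrade gracefully: tokenizers that define no special-token builders now return the ids unchanged (or simply concatenated) instead of breaking callers that request special tokens. A minimal sketch of the new fallback, using GPT2Tokenizer, which inherits these defaults at this commit; the 'gpt2' checkpoint name is assumed:

    from pytorch_transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    ids = tokenizer.encode("hello world")

    # previously this raised NotImplementedError; now it logs a warning and returns the ids untouched
    same_ids = tokenizer.add_special_tokens_single_sentence(ids)
    assert same_ids == ids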