Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e92bcb7e
Unverified
Commit
e92bcb7e
authored
Dec 14, 2019
by
Thomas Wolf
Committed by
GitHub
Dec 14, 2019
Browse files
Merge pull request #1739 from huggingface/t5
[WIP] Adding Google T5 model
parents
b6d4284b
cbb368ca
Changes
32
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
806 additions
and
80 deletions
+806
-80
transformers/modeling_tf_xlm.py
transformers/modeling_tf_xlm.py
+1
-1
transformers/modeling_utils.py
transformers/modeling_utils.py
+12
-6
transformers/modeling_xlm.py
transformers/modeling_xlm.py
+11
-1
transformers/tests/modeling_common_test.py
transformers/tests/modeling_common_test.py
+85
-38
transformers/tests/modeling_t5_test.py
transformers/tests/modeling_t5_test.py
+185
-0
transformers/tests/modeling_tf_common_test.py
transformers/tests/modeling_tf_common_test.py
+79
-31
transformers/tests/modeling_tf_t5_test.py
transformers/tests/modeling_tf_t5_test.py
+172
-0
transformers/tests/modeling_tf_transfo_xl_test.py
transformers/tests/modeling_tf_transfo_xl_test.py
+1
-1
transformers/tests/modeling_transfo_xl_test.py
transformers/tests/modeling_transfo_xl_test.py
+1
-1
transformers/tests/tokenization_t5_test.py
transformers/tests/tokenization_t5_test.py
+77
-0
transformers/tokenization_auto.py
transformers/tokenization_auto.py
+6
-1
transformers/tokenization_t5.py
transformers/tokenization_t5.py
+176
-0
No files found.
transformers/modeling_tf_xlm.py
View file @
e92bcb7e
...
@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
...
@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
langs_list
=
tf
.
constant
([[
1
,
1
,
0
,
0
,
1
],
[
1
,
1
,
1
,
0
,
0
],
[
1
,
0
,
0
,
1
,
1
]])
langs_list
=
tf
.
constant
([[
1
,
1
,
0
,
0
,
1
],
[
1
,
1
,
1
,
0
,
0
],
[
1
,
0
,
0
,
1
,
1
]])
else
:
else
:
langs_list
=
None
langs_list
=
None
return
[
inputs_list
,
attns_list
,
langs_list
]
return
{
'input_ids'
:
inputs_list
,
'attention_mask'
:
attns_list
,
'langs'
:
langs_list
}
XLM_START_DOCSTRING
=
r
""" The XLM model was proposed in
XLM_START_DOCSTRING
=
r
""" The XLM model was proposed in
...
...
transformers/modeling_utils.py
View file @
e92bcb7e
...
@@ -31,12 +31,11 @@ from torch.nn import CrossEntropyLoss
...
@@ -31,12 +31,11 @@ from torch.nn import CrossEntropyLoss
from
torch.nn
import
functional
as
F
from
torch.nn
import
functional
as
F
from
.configuration_utils
import
PretrainedConfig
from
.configuration_utils
import
PretrainedConfig
from
.file_utils
import
(
TF2_WEIGHTS_NAME
,
TF_WEIGHTS_NAME
,
WEIGHTS_NAME
,
from
.file_utils
import
(
TF2_WEIGHTS_NAME
,
TF_WEIGHTS_NAME
,
WEIGHTS_NAME
,
DUMMY_INPUTS
,
cached_path
,
hf_bucket_url
,
is_remote_url
)
cached_path
,
hf_bucket_url
,
is_remote_url
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
try
:
try
:
from
torch.nn
import
Identity
from
torch.nn
import
Identity
except
ImportError
:
except
ImportError
:
...
@@ -72,6 +71,15 @@ class PreTrainedModel(nn.Module):
...
@@ -72,6 +71,15 @@ class PreTrainedModel(nn.Module):
load_tf_weights
=
lambda
model
,
config
,
path
:
None
load_tf_weights
=
lambda
model
,
config
,
path
:
None
base_model_prefix
=
""
base_model_prefix
=
""
@
property
def
dummy_inputs
(
self
):
""" Dummy inputs to do a forward pass in the network.
Returns:
torch.Tensor with dummy inputs
"""
return
{
'input_ids'
:
torch
.
tensor
(
DUMMY_INPUTS
)}
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
PreTrainedModel
,
self
).
__init__
()
super
(
PreTrainedModel
,
self
).
__init__
()
if
not
isinstance
(
config
,
PretrainedConfig
):
if
not
isinstance
(
config
,
PretrainedConfig
):
...
@@ -161,7 +169,6 @@ class PreTrainedModel(nn.Module):
...
@@ -161,7 +169,6 @@ class PreTrainedModel(nn.Module):
base_model
.
vocab_size
=
new_num_tokens
base_model
.
vocab_size
=
new_num_tokens
# Tie weights again if needed
# Tie weights again if needed
if
hasattr
(
self
,
'tie_weights'
):
self
.
tie_weights
()
self
.
tie_weights
()
return
model_embeds
return
model_embeds
...
@@ -478,8 +485,7 @@ class PreTrainedModel(nn.Module):
...
@@ -478,8 +485,7 @@ class PreTrainedModel(nn.Module):
raise
RuntimeError
(
'Error(s) in loading state_dict for {}:
\n\t
{}'
.
format
(
raise
RuntimeError
(
'Error(s) in loading state_dict for {}:
\n\t
{}'
.
format
(
model
.
__class__
.
__name__
,
"
\n\t
"
.
join
(
error_msgs
)))
model
.
__class__
.
__name__
,
"
\n\t
"
.
join
(
error_msgs
)))
if
hasattr
(
model
,
'tie_weights'
):
model
.
tie_weights
()
# make sure word embedding weights are still tied if needed
model
.
tie_weights
()
# make sure word embedding weights are still tied
# Set model in evaluation mode to desactivate DropOut modules by default
# Set model in evaluation mode to desactivate DropOut modules by default
model
.
eval
()
model
.
eval
()
...
...
transformers/modeling_xlm.py
View file @
e92bcb7e
...
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
...
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
def
__init__
(
self
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
*
inputs
,
**
kwargs
):
super
(
XLMPreTrainedModel
,
self
).
__init__
(
*
inputs
,
**
kwargs
)
super
(
XLMPreTrainedModel
,
self
).
__init__
(
*
inputs
,
**
kwargs
)
@
property
def
dummy_inputs
(
self
):
inputs_list
=
torch
.
tensor
([[
7
,
6
,
0
,
0
,
1
],
[
1
,
2
,
3
,
0
,
0
],
[
0
,
0
,
0
,
4
,
5
]])
attns_list
=
torch
.
tensor
([[
1
,
1
,
0
,
0
,
1
],
[
1
,
1
,
1
,
0
,
0
],
[
1
,
0
,
0
,
1
,
1
]])
if
self
.
config
.
use_lang_emb
and
self
.
config
.
n_langs
>
1
:
langs_list
=
torch
.
tensor
([[
1
,
1
,
0
,
0
,
1
],
[
1
,
1
,
1
,
0
,
0
],
[
1
,
0
,
0
,
1
,
1
]])
else
:
langs_list
=
None
return
{
'input_ids'
:
inputs_list
,
'attention_mask'
:
attns_list
,
'langs'
:
langs_list
}
def
_init_weights
(
self
,
module
):
def
_init_weights
(
self
,
module
):
""" Initialize the weights. """
""" Initialize the weights. """
if
isinstance
(
module
,
nn
.
Embedding
):
if
isinstance
(
module
,
nn
.
Embedding
):
...
...
transformers/tests/modeling_common_test.py
View file @
e92bcb7e
...
@@ -58,7 +58,7 @@ else:
...
@@ -58,7 +58,7 @@ else:
def
_config_zero_init
(
config
):
def
_config_zero_init
(
config
):
configs_no_init
=
copy
.
deepcopy
(
config
)
configs_no_init
=
copy
.
deepcopy
(
config
)
for
key
in
configs_no_init
.
__dict__
.
keys
():
for
key
in
configs_no_init
.
__dict__
.
keys
():
if
'_range'
in
key
or
'_std'
in
key
:
if
'_range'
in
key
or
'_std'
in
key
or
'initializer_factor'
in
key
:
setattr
(
configs_no_init
,
key
,
0.0
)
setattr
(
configs_no_init
,
key
,
0.0
)
return
configs_no_init
return
configs_no_init
...
@@ -73,6 +73,7 @@ class CommonTestCases:
...
@@ -73,6 +73,7 @@ class CommonTestCases:
test_pruning
=
True
test_pruning
=
True
test_resize_embeddings
=
True
test_resize_embeddings
=
True
test_head_masking
=
True
test_head_masking
=
True
is_encoder_decoder
=
False
def
test_save_load
(
self
):
def
test_save_load
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
...
@@ -83,6 +84,8 @@ class CommonTestCases:
...
@@ -83,6 +84,8 @@ class CommonTestCases:
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
out_2
=
outputs
[
0
].
numpy
()
out_2
[
np
.
isnan
(
out_2
)]
=
0
with
TemporaryDirectory
()
as
tmpdirname
:
with
TemporaryDirectory
()
as
tmpdirname
:
model
.
save_pretrained
(
tmpdirname
)
model
.
save_pretrained
(
tmpdirname
)
...
@@ -93,9 +96,7 @@ class CommonTestCases:
...
@@ -93,9 +96,7 @@ class CommonTestCases:
# Make sure we don't have nans
# Make sure we don't have nans
out_1
=
after_outputs
[
0
].
cpu
().
numpy
()
out_1
=
after_outputs
[
0
].
cpu
().
numpy
()
out_2
=
outputs
[
0
].
cpu
().
numpy
()
out_1
[
np
.
isnan
(
out_1
)]
=
0
out_1
=
out_1
[
~
np
.
isnan
(
out_1
)]
out_2
=
out_2
[
~
np
.
isnan
(
out_2
)]
max_diff
=
np
.
amax
(
np
.
abs
(
out_1
-
out_2
))
max_diff
=
np
.
amax
(
np
.
abs
(
out_1
-
out_2
))
self
.
assertLessEqual
(
max_diff
,
1e-5
)
self
.
assertLessEqual
(
max_diff
,
1e-5
)
...
@@ -117,19 +118,31 @@ class CommonTestCases:
...
@@ -117,19 +118,31 @@ class CommonTestCases:
model
=
model_class
(
config
)
model
=
model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
first
,
second
=
model
(
inputs_dict
[
"input_ids"
])[
0
],
model
(
inputs_dict
[
"input_ids"
])[
0
]
with
torch
.
no_grad
():
self
.
assertEqual
(
first
.
ne
(
second
).
sum
().
item
(),
0
)
first
=
model
(
**
inputs_dict
)[
0
]
second
=
model
(
**
inputs_dict
)[
0
]
out_1
=
first
.
cpu
().
numpy
()
out_2
=
second
.
cpu
().
numpy
()
out_1
=
out_1
[
~
np
.
isnan
(
out_1
)]
out_2
=
out_2
[
~
np
.
isnan
(
out_2
)]
max_diff
=
np
.
amax
(
np
.
abs
(
out_1
-
out_2
))
self
.
assertLessEqual
(
max_diff
,
1e-5
)
def
test_attention_outputs
(
self
):
def
test_attention_outputs
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
decoder_seq_length
=
self
.
model_tester
.
decoder_seq_length
if
hasattr
(
self
.
model_tester
,
'decoder_seq_length'
)
else
self
.
model_tester
.
seq_length
encoder_seq_length
=
self
.
model_tester
.
encoder_seq_length
if
hasattr
(
self
.
model_tester
,
'encoder_seq_length'
)
else
self
.
model_tester
.
seq_length
decoder_key_length
=
self
.
model_tester
.
key_length
if
hasattr
(
self
.
model_tester
,
'key_length'
)
else
decoder_seq_length
encoder_key_length
=
self
.
model_tester
.
key_length
if
hasattr
(
self
.
model_tester
,
'key_length'
)
else
encoder_seq_length
for
model_class
in
self
.
all_model_classes
:
for
model_class
in
self
.
all_model_classes
:
config
.
output_attentions
=
True
config
.
output_attentions
=
True
config
.
output_hidden_states
=
False
config
.
output_hidden_states
=
False
model
=
model_class
(
config
)
model
=
model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
...
@@ -138,28 +151,42 @@ class CommonTestCases:
...
@@ -138,28 +151,42 @@ class CommonTestCases:
self
.
assertListEqual
(
self
.
assertListEqual
(
list
(
attentions
[
0
].
shape
[
-
3
:]),
list
(
attentions
[
0
].
shape
[
-
3
:]),
[
self
.
model_tester
.
num_attention_heads
,
[
self
.
model_tester
.
num_attention_heads
,
self
.
model_tester
.
seq_length
,
encoder_
seq_length
,
self
.
model_tester
.
key_len
if
hasattr
(
self
.
model_tester
,
'key_len'
)
else
self
.
model_tester
.
seq
_length
])
encoder_key
_length
])
out_len
=
len
(
outputs
)
out_len
=
len
(
outputs
)
if
self
.
is_encoder_decoder
:
self
.
assertEqual
(
out_len
%
2
,
0
)
decoder_attentions
=
outputs
[(
out_len
//
2
)
-
1
]
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_hidden_states
,
False
)
self
.
assertEqual
(
len
(
decoder_attentions
),
self
.
model_tester
.
num_hidden_layers
)
self
.
assertListEqual
(
list
(
decoder_attentions
[
0
].
shape
[
-
3
:]),
[
self
.
model_tester
.
num_attention_heads
,
decoder_seq_length
,
decoder_key_length
])
# Check attention is always last and order is fine
# Check attention is always last and order is fine
config
.
output_attentions
=
True
config
.
output_attentions
=
True
config
.
output_hidden_states
=
True
config
.
output_hidden_states
=
True
model
=
model_class
(
config
)
model
=
model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
self
.
assertEqual
(
out_len
+
1
,
len
(
outputs
))
self
.
assertEqual
(
out_len
+
(
2
if
self
.
is_encoder_decoder
else
1
)
,
len
(
outputs
))
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_hidden_states
,
True
)
self
.
assertEqual
(
model
.
config
.
output_hidden_states
,
True
)
attentions
=
outputs
[
-
1
]
self_
attentions
=
outputs
[
-
1
]
self
.
assertEqual
(
len
(
attentions
),
self
.
model_tester
.
num_hidden_layers
)
self
.
assertEqual
(
len
(
self_
attentions
),
self
.
model_tester
.
num_hidden_layers
)
self
.
assertListEqual
(
self
.
assertListEqual
(
list
(
attentions
[
0
].
shape
[
-
3
:]),
list
(
self_
attentions
[
0
].
shape
[
-
3
:]),
[
self
.
model_tester
.
num_attention_heads
,
[
self
.
model_tester
.
num_attention_heads
,
self
.
model_tester
.
seq_length
,
encoder_
seq_length
,
self
.
model_tester
.
key_len
if
hasattr
(
self
.
model_tester
,
'key_len'
)
else
self
.
model_tester
.
seq
_length
])
encoder_key
_length
])
def
test_torchscript
(
self
):
def
test_torchscript
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
...
@@ -223,7 +250,6 @@ class CommonTestCases:
...
@@ -223,7 +250,6 @@ class CommonTestCases:
self
.
assertTrue
(
models_equal
)
self
.
assertTrue
(
models_equal
)
def
test_headmasking
(
self
):
def
test_headmasking
(
self
):
if
not
self
.
test_head_masking
:
if
not
self
.
test_head_masking
:
return
return
...
@@ -278,7 +304,6 @@ class CommonTestCases:
...
@@ -278,7 +304,6 @@ class CommonTestCases:
self
.
assertNotEqual
(
self
.
assertNotEqual
(
attentions
[
-
1
][...,
-
1
,
:,
:].
flatten
().
sum
().
item
(),
0.0
)
attentions
[
-
1
][...,
-
1
,
:,
:].
flatten
().
sum
().
item
(),
0.0
)
def
test_head_pruning
(
self
):
def
test_head_pruning
(
self
):
if
not
self
.
test_pruning
:
if
not
self
.
test_pruning
:
return
return
...
@@ -297,6 +322,7 @@ class CommonTestCases:
...
@@ -297,6 +322,7 @@ class CommonTestCases:
heads_to_prune
=
{
0
:
list
(
range
(
1
,
self
.
model_tester
.
num_attention_heads
)),
heads_to_prune
=
{
0
:
list
(
range
(
1
,
self
.
model_tester
.
num_attention_heads
)),
-
1
:
[
0
]}
-
1
:
[
0
]}
model
.
prune_heads
(
heads_to_prune
)
model
.
prune_heads
(
heads_to_prune
)
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
...
@@ -333,6 +359,7 @@ class CommonTestCases:
...
@@ -333,6 +359,7 @@ class CommonTestCases:
model
=
model_class
.
from_pretrained
(
directory
)
model
=
model_class
.
from_pretrained
(
directory
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
self
.
assertEqual
(
attentions
[
0
].
shape
[
-
3
],
1
)
self
.
assertEqual
(
attentions
[
0
].
shape
[
-
3
],
1
)
...
@@ -362,6 +389,7 @@ class CommonTestCases:
...
@@ -362,6 +389,7 @@ class CommonTestCases:
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
...
@@ -389,6 +417,7 @@ class CommonTestCases:
...
@@ -389,6 +417,7 @@ class CommonTestCases:
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
...
@@ -406,6 +435,7 @@ class CommonTestCases:
...
@@ -406,6 +435,7 @@ class CommonTestCases:
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
shutil
.
rmtree
(
directory
)
shutil
.
rmtree
(
directory
)
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
...
@@ -417,6 +447,7 @@ class CommonTestCases:
...
@@ -417,6 +447,7 @@ class CommonTestCases:
heads_to_prune
=
{
0
:
[
0
],
2
:
[
1
,
2
]}
heads_to_prune
=
{
0
:
[
0
],
2
:
[
1
,
2
]}
model
.
prune_heads
(
heads_to_prune
)
model
.
prune_heads
(
heads_to_prune
)
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
attentions
=
outputs
[
-
1
]
attentions
=
outputs
[
-
1
]
...
@@ -427,7 +458,6 @@ class CommonTestCases:
...
@@ -427,7 +458,6 @@ class CommonTestCases:
self
.
assertDictEqual
(
model
.
config
.
pruned_heads
,
{
0
:
[
0
],
1
:
[
1
,
2
],
2
:
[
1
,
2
]})
self
.
assertDictEqual
(
model
.
config
.
pruned_heads
,
{
0
:
[
0
],
1
:
[
1
,
2
],
2
:
[
1
,
2
]})
def
test_hidden_states_output
(
self
):
def
test_hidden_states_output
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
...
@@ -437,6 +467,7 @@ class CommonTestCases:
...
@@ -437,6 +467,7 @@ class CommonTestCases:
model
=
model_class
(
config
)
model
=
model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
outputs
=
model
(
**
inputs_dict
)
hidden_states
=
outputs
[
-
1
]
hidden_states
=
outputs
[
-
1
]
self
.
assertEqual
(
model
.
config
.
output_attentions
,
False
)
self
.
assertEqual
(
model
.
config
.
output_attentions
,
False
)
...
@@ -444,7 +475,8 @@ class CommonTestCases:
...
@@ -444,7 +475,8 @@ class CommonTestCases:
self
.
assertEqual
(
len
(
hidden_states
),
self
.
model_tester
.
num_hidden_layers
+
1
)
self
.
assertEqual
(
len
(
hidden_states
),
self
.
model_tester
.
num_hidden_layers
+
1
)
self
.
assertListEqual
(
self
.
assertListEqual
(
list
(
hidden_states
[
0
].
shape
[
-
2
:]),
list
(
hidden_states
[
0
].
shape
[
-
2
:]),
[
self
.
model_tester
.
seq_length
,
self
.
model_tester
.
hidden_size
])
[
self
.
model_tester
.
encoder_seq_length
if
hasattr
(
self
.
model_tester
,
'encoder_seq_length'
)
else
self
.
model_tester
.
seq_length
,
self
.
model_tester
.
hidden_size
])
def
test_resize_tokens_embeddings
(
self
):
def
test_resize_tokens_embeddings
(
self
):
original_config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
original_config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
...
@@ -550,8 +582,14 @@ class CommonTestCases:
...
@@ -550,8 +582,14 @@ class CommonTestCases:
def
test_inputs_embeds
(
self
):
def
test_inputs_embeds
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
if
not
self
.
is_encoder_decoder
:
input_ids
=
inputs_dict
[
"input_ids"
]
input_ids
=
inputs_dict
[
"input_ids"
]
del
inputs_dict
[
"input_ids"
]
del
inputs_dict
[
"input_ids"
]
else
:
encoder_input_ids
=
inputs_dict
[
"encoder_input_ids"
]
decoder_input_ids
=
inputs_dict
[
"decoder_input_ids"
]
del
inputs_dict
[
"encoder_input_ids"
]
del
inputs_dict
[
"decoder_input_ids"
]
for
model_class
in
self
.
all_model_classes
:
for
model_class
in
self
.
all_model_classes
:
model
=
model_class
(
config
)
model
=
model_class
(
config
)
...
@@ -559,9 +597,14 @@ class CommonTestCases:
...
@@ -559,9 +597,14 @@ class CommonTestCases:
model
.
eval
()
model
.
eval
()
wte
=
model
.
get_input_embeddings
()
wte
=
model
.
get_input_embeddings
()
if
not
self
.
is_encoder_decoder
:
inputs_dict
[
"inputs_embeds"
]
=
wte
(
input_ids
)
inputs_dict
[
"inputs_embeds"
]
=
wte
(
input_ids
)
outputs
=
model
(
**
inputs_dict
)
else
:
inputs_dict
[
"encoder_inputs_embeds"
]
=
wte
(
encoder_input_ids
)
inputs_dict
[
"decoder_inputs_embeds"
]
=
wte
(
decoder_input_ids
)
with
torch
.
no_grad
():
outputs
=
model
(
**
inputs_dict
)
class
GPTModelTester
(
CommonModelTester
):
class
GPTModelTester
(
CommonModelTester
):
...
@@ -649,6 +692,7 @@ class CommonTestCases:
...
@@ -649,6 +692,7 @@ class CommonTestCases:
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
input_ids
,
position_ids
,
token_type_ids
)
outputs
=
model
(
input_ids
,
position_ids
,
token_type_ids
)
outputs
=
model
(
input_ids
,
position_ids
)
outputs
=
model
(
input_ids
,
position_ids
)
outputs
=
model
(
input_ids
)
outputs
=
model
(
input_ids
)
...
@@ -664,6 +708,7 @@ class CommonTestCases:
...
@@ -664,6 +708,7 @@ class CommonTestCases:
model
=
self
.
lm_head_model_class
(
config
)
model
=
self
.
lm_head_model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
input_ids
,
position_ids
,
token_type_ids
,
lm_labels
)
outputs
=
model
(
input_ids
,
position_ids
,
token_type_ids
,
lm_labels
)
loss
,
lm_logits
=
outputs
[:
2
]
loss
,
lm_logits
=
outputs
[:
2
]
...
@@ -681,6 +726,7 @@ class CommonTestCases:
...
@@ -681,6 +726,7 @@ class CommonTestCases:
model
=
model_class
(
config
)
model
=
model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
input_ids
)
outputs
=
model
(
input_ids
)
presents
=
outputs
[
-
1
]
presents
=
outputs
[
-
1
]
self
.
parent
.
assertEqual
(
self
.
num_hidden_layers
,
len
(
presents
))
self
.
parent
.
assertEqual
(
self
.
num_hidden_layers
,
len
(
presents
))
...
@@ -694,6 +740,7 @@ class CommonTestCases:
...
@@ -694,6 +740,7 @@ class CommonTestCases:
model
=
self
.
double_head_model_class
(
config
)
model
=
self
.
double_head_model_class
(
config
)
model
.
to
(
torch_device
)
model
.
to
(
torch_device
)
model
.
eval
()
model
.
eval
()
with
torch
.
no_grad
():
outputs
=
model
(
input_ids
,
mc_token_ids
,
lm_labels
=
lm_labels
,
mc_labels
=
mc_labels
,
outputs
=
model
(
input_ids
,
mc_token_ids
,
lm_labels
=
lm_labels
,
mc_labels
=
mc_labels
,
token_type_ids
=
token_type_ids
,
position_ids
=
position_ids
)
token_type_ids
=
token_type_ids
,
position_ids
=
position_ids
)
lm_loss
,
mc_loss
,
lm_logits
,
mc_logits
=
outputs
[:
4
]
lm_loss
,
mc_loss
,
lm_logits
,
mc_logits
=
outputs
[:
4
]
...
...
transformers/tests/modeling_t5_test.py
0 → 100644
View file @
e92bcb7e
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
unittest
import
shutil
from
transformers
import
is_torch_available
from
.modeling_common_test
import
(
CommonTestCases
,
ids_tensor
,
floats_tensor
)
from
.configuration_common_test
import
ConfigTester
from
.utils
import
require_torch
,
slow
,
torch_device
if
is_torch_available
():
from
transformers
import
(
T5Config
,
T5Model
,
T5WithLMHeadModel
)
from
transformers.modeling_t5
import
T5_PRETRAINED_MODEL_ARCHIVE_MAP
@
require_torch
class
T5ModelTest
(
CommonTestCases
.
CommonModelTester
):
all_model_classes
=
(
T5Model
,
T5WithLMHeadModel
)
if
is_torch_available
()
else
()
test_pruning
=
False
test_torchscript
=
False
test_resize_embeddings
=
False
is_encoder_decoder
=
True
class
T5ModelTester
(
object
):
def
__init__
(
self
,
parent
,
batch_size
=
13
,
encoder_seq_length
=
7
,
decoder_seq_length
=
9
,
is_training
=
True
,
use_attention_mask
=
True
,
use_labels
=
True
,
vocab_size
=
99
,
n_positions
=
14
,
hidden_size
=
32
,
num_hidden_layers
=
5
,
num_attention_heads
=
4
,
d_ff
=
37
,
relative_attention_num_buckets
=
8
,
dropout_rate
=
0.1
,
initializer_factor
=
0.002
,
scope
=
None
,
):
self
.
parent
=
parent
self
.
batch_size
=
batch_size
self
.
encoder_seq_length
=
encoder_seq_length
self
.
decoder_seq_length
=
decoder_seq_length
self
.
is_training
=
is_training
self
.
use_attention_mask
=
use_attention_mask
self
.
use_labels
=
use_labels
self
.
vocab_size
=
vocab_size
self
.
n_positions
=
n_positions
self
.
hidden_size
=
hidden_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
d_ff
=
d_ff
self
.
relative_attention_num_buckets
=
relative_attention_num_buckets
self
.
dropout_rate
=
dropout_rate
self
.
initializer_factor
=
initializer_factor
self
.
scope
=
scope
def
prepare_config_and_inputs
(
self
):
encoder_input_ids
=
ids_tensor
([
self
.
batch_size
,
self
.
encoder_seq_length
],
self
.
vocab_size
)
decoder_input_ids
=
ids_tensor
([
self
.
batch_size
,
self
.
decoder_seq_length
],
self
.
vocab_size
)
encoder_attention_mask
=
None
decoder_attention_mask
=
None
if
self
.
use_attention_mask
:
encoder_attention_mask
=
ids_tensor
([
self
.
batch_size
,
self
.
encoder_seq_length
],
vocab_size
=
2
)
decoder_attention_mask
=
ids_tensor
([
self
.
batch_size
,
self
.
decoder_seq_length
],
vocab_size
=
2
)
decoder_lm_labels
=
None
if
self
.
use_labels
:
decoder_lm_labels
=
ids_tensor
([
self
.
batch_size
,
self
.
decoder_seq_length
],
self
.
vocab_size
)
config
=
T5Config
(
vocab_size_or_config_json_file
=
self
.
vocab_size
,
n_positions
=
self
.
n_positions
,
d_model
=
self
.
hidden_size
,
d_ff
=
self
.
d_ff
,
d_kv
=
self
.
hidden_size
//
self
.
num_attention_heads
,
num_layers
=
self
.
num_hidden_layers
,
num_heads
=
self
.
num_attention_heads
,
relative_attention_num_buckets
=
self
.
relative_attention_num_buckets
,
dropout_rate
=
self
.
dropout_rate
,
initializer_factor
=
self
.
initializer_factor
)
return
(
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
)
def
check_loss_output
(
self
,
result
):
self
.
parent
.
assertListEqual
(
list
(
result
[
"loss"
].
size
()),
[])
def
create_and_check_t5_model
(
self
,
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
):
model
=
T5Model
(
config
=
config
)
model
.
eval
()
decoder_output
,
encoder_output
=
model
(
encoder_input_ids
=
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
encoder_attention_mask
=
encoder_attention_mask
,
decoder_attention_mask
=
decoder_attention_mask
)
decoder_output
,
encoder_output
=
model
(
encoder_input_ids
=
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
)
result
=
{
"encoder_output"
:
encoder_output
,
"decoder_output"
:
decoder_output
,
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"encoder_output"
].
size
()),
[
self
.
batch_size
,
self
.
encoder_seq_length
,
self
.
hidden_size
])
self
.
parent
.
assertListEqual
(
list
(
result
[
"decoder_output"
].
size
()),
[
self
.
batch_size
,
self
.
decoder_seq_length
,
self
.
hidden_size
])
def
create_and_check_t5_with_lm_head
(
self
,
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
):
model
=
T5WithLMHeadModel
(
config
=
config
)
model
.
eval
()
outputs
=
model
(
encoder_input_ids
=
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
decoder_attention_mask
=
decoder_attention_mask
,
decoder_lm_labels
=
decoder_lm_labels
)
loss
,
prediction_scores
=
outputs
[
0
],
outputs
[
1
]
result
=
{
"loss"
:
loss
,
"prediction_scores"
:
prediction_scores
,
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"prediction_scores"
].
size
()),
[
self
.
batch_size
,
self
.
decoder_seq_length
,
self
.
vocab_size
])
self
.
check_loss_output
(
result
)
def
prepare_config_and_inputs_for_common
(
self
):
config_and_inputs
=
self
.
prepare_config_and_inputs
()
(
config
,
encoder_input_ids
,
decoder_input_ids
,
encoder_attention_mask
,
decoder_attention_mask
,
decoder_lm_labels
)
=
config_and_inputs
inputs_dict
=
{
'encoder_input_ids'
:
encoder_input_ids
,
'decoder_input_ids'
:
decoder_input_ids
,
'decoder_attention_mask'
:
decoder_attention_mask
,
'encoder_attention_mask'
:
encoder_attention_mask
}
return
config
,
inputs_dict
def
setUp
(
self
):
self
.
model_tester
=
T5ModelTest
.
T5ModelTester
(
self
)
self
.
config_tester
=
ConfigTester
(
self
,
config_class
=
T5Config
,
d_model
=
37
)
def
test_config
(
self
):
self
.
config_tester
.
run_common_tests
()
def
test_t5_model
(
self
):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_t5_model
(
*
config_and_inputs
)
def
test_with_lm_head
(
self
):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_t5_with_lm_head
(
*
config_and_inputs
)
@
slow
def
test_model_from_pretrained
(
self
):
cache_dir
=
"/tmp/transformers_test/"
for
model_name
in
list
(
T5_PRETRAINED_MODEL_ARCHIVE_MAP
.
keys
())[:
1
]:
model
=
T5Model
.
from_pretrained
(
model_name
,
cache_dir
=
cache_dir
)
shutil
.
rmtree
(
cache_dir
)
self
.
assertIsNotNone
(
model
)
if
__name__
==
"__main__"
:
unittest
.
main
()
transformers/tests/modeling_tf_common_test.py
View file @
e92bcb7e
...
@@ -69,6 +69,7 @@ class TFCommonTestCases:
...
@@ -69,6 +69,7 @@ class TFCommonTestCases:
test_torchscript
=
True
test_torchscript
=
True
test_pruning
=
True
test_pruning
=
True
test_resize_embeddings
=
True
test_resize_embeddings
=
True
is_encoder_decoder
=
False
def
test_initialization
(
self
):
def
test_initialization
(
self
):
pass
pass
...
@@ -129,8 +130,12 @@ class TFCommonTestCases:
...
@@ -129,8 +130,12 @@ class TFCommonTestCases:
for
name
,
key
in
inputs_dict
.
items
())
for
name
,
key
in
inputs_dict
.
items
())
with
torch
.
no_grad
():
with
torch
.
no_grad
():
pto
=
pt_model
(
**
pt_inputs_dict
)
pto
=
pt_model
(
**
pt_inputs_dict
)
tfo
=
tf_model
(
inputs_dict
)
tfo
=
tf_model
(
inputs_dict
,
training
=
False
)
max_diff
=
np
.
amax
(
np
.
abs
(
tfo
[
0
].
numpy
()
-
pto
[
0
].
numpy
()))
tf_hidden_states
=
tfo
[
0
].
numpy
()
pt_hidden_states
=
pto
[
0
].
numpy
()
tf_hidden_states
[
np
.
isnan
(
tf_hidden_states
)]
=
0
pt_hidden_states
[
np
.
isnan
(
pt_hidden_states
)]
=
0
max_diff
=
np
.
amax
(
np
.
abs
(
tf_hidden_states
-
pt_hidden_states
))
self
.
assertLessEqual
(
max_diff
,
2e-2
)
self
.
assertLessEqual
(
max_diff
,
2e-2
)
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
...
@@ -150,12 +155,20 @@ class TFCommonTestCases:
...
@@ -150,12 +155,20 @@ class TFCommonTestCases:
with
torch
.
no_grad
():
with
torch
.
no_grad
():
pto
=
pt_model
(
**
pt_inputs_dict
)
pto
=
pt_model
(
**
pt_inputs_dict
)
tfo
=
tf_model
(
inputs_dict
)
tfo
=
tf_model
(
inputs_dict
)
max_diff
=
np
.
amax
(
np
.
abs
(
tfo
[
0
].
numpy
()
-
pto
[
0
].
numpy
()))
tfo
=
tfo
[
0
].
numpy
()
pto
=
pto
[
0
].
numpy
()
tfo
[
np
.
isnan
(
tfo
)]
=
0
pto
[
np
.
isnan
(
pto
)]
=
0
max_diff
=
np
.
amax
(
np
.
abs
(
tfo
-
pto
))
self
.
assertLessEqual
(
max_diff
,
2e-2
)
self
.
assertLessEqual
(
max_diff
,
2e-2
)
def
test_compile_tf_model
(
self
):
def
test_compile_tf_model
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
if
self
.
is_encoder_decoder
:
input_ids
=
{
'decoder_input_ids'
:
tf
.
keras
.
Input
(
batch_shape
=
(
2
,
2000
),
name
=
'decoder_input_ids'
,
dtype
=
'int32'
),
'encoder_input_ids'
:
tf
.
keras
.
Input
(
batch_shape
=
(
2
,
2000
),
name
=
'encoder_input_ids'
,
dtype
=
'int32'
)}
else
:
input_ids
=
tf
.
keras
.
Input
(
batch_shape
=
(
2
,
2000
),
name
=
'input_ids'
,
dtype
=
'int32'
)
input_ids
=
tf
.
keras
.
Input
(
batch_shape
=
(
2
,
2000
),
name
=
'input_ids'
,
dtype
=
'int32'
)
optimizer
=
tf
.
keras
.
optimizers
.
Adam
(
learning_rate
=
3e-5
,
epsilon
=
1e-08
,
clipnorm
=
1.0
)
optimizer
=
tf
.
keras
.
optimizers
.
Adam
(
learning_rate
=
3e-5
,
epsilon
=
1e-08
,
clipnorm
=
1.0
)
loss
=
tf
.
keras
.
losses
.
SparseCategoricalCrossentropy
(
from_logits
=
True
)
loss
=
tf
.
keras
.
losses
.
SparseCategoricalCrossentropy
(
from_logits
=
True
)
...
@@ -189,7 +202,7 @@ class TFCommonTestCases:
...
@@ -189,7 +202,7 @@ class TFCommonTestCases:
outputs_dict
=
model
(
inputs_dict
)
outputs_dict
=
model
(
inputs_dict
)
inputs_keywords
=
copy
.
deepcopy
(
inputs_dict
)
inputs_keywords
=
copy
.
deepcopy
(
inputs_dict
)
input_ids
=
inputs_keywords
.
pop
(
'input_ids'
)
input_ids
=
inputs_keywords
.
pop
(
'input_ids'
if
not
self
.
is_encoder_decoder
else
'decoder_input_ids'
,
None
)
outputs_keywords
=
model
(
input_ids
,
**
inputs_keywords
)
outputs_keywords
=
model
(
input_ids
,
**
inputs_keywords
)
output_dict
=
outputs_dict
[
0
].
numpy
()
output_dict
=
outputs_dict
[
0
].
numpy
()
...
@@ -200,6 +213,11 @@ class TFCommonTestCases:
...
@@ -200,6 +213,11 @@ class TFCommonTestCases:
def
test_attention_outputs
(
self
):
def
test_attention_outputs
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
decoder_seq_length
=
self
.
model_tester
.
decoder_seq_length
if
hasattr
(
self
.
model_tester
,
'decoder_seq_length'
)
else
self
.
model_tester
.
seq_length
encoder_seq_length
=
self
.
model_tester
.
encoder_seq_length
if
hasattr
(
self
.
model_tester
,
'encoder_seq_length'
)
else
self
.
model_tester
.
seq_length
decoder_key_length
=
self
.
model_tester
.
key_length
if
hasattr
(
self
.
model_tester
,
'key_length'
)
else
decoder_seq_length
encoder_key_length
=
self
.
model_tester
.
key_length
if
hasattr
(
self
.
model_tester
,
'key_length'
)
else
encoder_seq_length
for
model_class
in
self
.
all_model_classes
:
for
model_class
in
self
.
all_model_classes
:
config
.
output_attentions
=
True
config
.
output_attentions
=
True
config
.
output_hidden_states
=
False
config
.
output_hidden_states
=
False
...
@@ -212,16 +230,28 @@ class TFCommonTestCases:
...
@@ -212,16 +230,28 @@ class TFCommonTestCases:
self
.
assertListEqual
(
self
.
assertListEqual
(
list
(
attentions
[
0
].
shape
[
-
3
:]),
list
(
attentions
[
0
].
shape
[
-
3
:]),
[
self
.
model_tester
.
num_attention_heads
,
[
self
.
model_tester
.
num_attention_heads
,
self
.
model_tester
.
seq_length
,
encoder_
seq_length
,
self
.
model_tester
.
key_len
if
hasattr
(
self
.
model_tester
,
'key_len'
)
else
self
.
model_tester
.
seq
_length
])
encoder_key
_length
])
out_len
=
len
(
outputs
)
out_len
=
len
(
outputs
)
if
self
.
is_encoder_decoder
:
self
.
assertEqual
(
out_len
%
2
,
0
)
decoder_attentions
=
outputs
[(
out_len
//
2
)
-
1
]
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_hidden_states
,
False
)
self
.
assertEqual
(
len
(
decoder_attentions
),
self
.
model_tester
.
num_hidden_layers
)
self
.
assertListEqual
(
list
(
decoder_attentions
[
0
].
shape
[
-
3
:]),
[
self
.
model_tester
.
num_attention_heads
,
decoder_seq_length
,
decoder_key_length
])
# Check attention is always last and order is fine
# Check attention is always last and order is fine
config
.
output_attentions
=
True
config
.
output_attentions
=
True
config
.
output_hidden_states
=
True
config
.
output_hidden_states
=
True
model
=
model_class
(
config
)
model
=
model_class
(
config
)
outputs
=
model
(
inputs_dict
)
outputs
=
model
(
inputs_dict
)
self
.
assertEqual
(
out_len
+
1
,
len
(
outputs
))
self
.
assertEqual
(
out_len
+
(
2
if
self
.
is_encoder_decoder
else
1
)
,
len
(
outputs
))
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_attentions
,
True
)
self
.
assertEqual
(
model
.
config
.
output_hidden_states
,
True
)
self
.
assertEqual
(
model
.
config
.
output_hidden_states
,
True
)
...
@@ -230,8 +260,8 @@ class TFCommonTestCases:
...
@@ -230,8 +260,8 @@ class TFCommonTestCases:
self
.
assertListEqual
(
self
.
assertListEqual
(
list
(
attentions
[
0
].
shape
[
-
3
:]),
list
(
attentions
[
0
].
shape
[
-
3
:]),
[
self
.
model_tester
.
num_attention_heads
,
[
self
.
model_tester
.
num_attention_heads
,
self
.
model_tester
.
seq_length
,
encoder_
seq_length
,
self
.
model_tester
.
key_len
if
hasattr
(
self
.
model_tester
,
'key_len'
)
else
self
.
model_tester
.
seq
_length
])
encoder_key
_length
])
def
test_hidden_states_output
(
self
):
def
test_hidden_states_output
(
self
):
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
...
@@ -264,17 +294,17 @@ class TFCommonTestCases:
...
@@ -264,17 +294,17 @@ class TFCommonTestCases:
for
model_class
in
self
.
all_model_classes
:
for
model_class
in
self
.
all_model_classes
:
model
=
model_class
(
config
)
model
=
model_class
(
config
)
first
,
second
=
model
(
inputs_dict
,
training
=
False
)[
0
],
model
(
inputs_dict
,
training
=
False
)[
0
]
first
,
second
=
model
(
inputs_dict
,
training
=
False
)[
0
],
model
(
inputs_dict
,
training
=
False
)[
0
]
self
.
assertTrue
(
tf
.
math
.
equal
(
first
,
second
).
numpy
().
all
())
out_1
=
first
.
numpy
()
out_2
=
second
.
numpy
()
def
test_inputs_embeds
(
self
):
out_1
=
out_1
[
~
np
.
isnan
(
out_1
)]
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
out_2
=
out_2
[
~
np
.
isnan
(
out_2
)]
input_ids
=
inputs_dict
[
"input_ids"
]
max_diff
=
np
.
amax
(
np
.
abs
(
out_1
-
out_2
))
del
inputs_dict
[
"input_ids"
]
self
.
assertLessEqual
(
max_diff
,
1e-5
)
for
model_class
in
self
.
all_model_classes
:
model
=
model_class
(
config
)
wte
=
model
.
get_input_embeddings
()
def
_get_embeds
(
self
,
wte
,
input_ids
):
# ^^ In our TF models, the input_embeddings can take slightly different forms,
# so we try a few of them.
# We used to fall back to just synthetically creating a dummy tensor of ones:
try
:
try
:
x
=
wte
(
input_ids
,
mode
=
"embedding"
)
x
=
wte
(
input_ids
,
mode
=
"embedding"
)
except
:
except
:
...
@@ -288,11 +318,29 @@ class TFCommonTestCases:
...
@@ -288,11 +318,29 @@ class TFCommonTestCases:
x
=
tf
.
ones
(
input_ids
.
shape
+
[
self
.
model_tester
.
embedding_size
],
dtype
=
tf
.
dtypes
.
float32
)
x
=
tf
.
ones
(
input_ids
.
shape
+
[
self
.
model_tester
.
embedding_size
],
dtype
=
tf
.
dtypes
.
float32
)
else
:
else
:
x
=
tf
.
ones
(
input_ids
.
shape
+
[
self
.
model_tester
.
hidden_size
],
dtype
=
tf
.
dtypes
.
float32
)
x
=
tf
.
ones
(
input_ids
.
shape
+
[
self
.
model_tester
.
hidden_size
],
dtype
=
tf
.
dtypes
.
float32
)
# ^^ In our TF models, the input_embeddings can take slightly different forms,
return
x
# so we try a few of them.
# We used to fall back to just synthetically creating a dummy tensor of ones:
def
test_inputs_embeds
(
self
):
#
config
,
inputs_dict
=
self
.
model_tester
.
prepare_config_and_inputs_for_common
()
inputs_dict
[
"inputs_embeds"
]
=
x
if
not
self
.
is_encoder_decoder
:
input_ids
=
inputs_dict
[
"input_ids"
]
del
inputs_dict
[
"input_ids"
]
else
:
encoder_input_ids
=
inputs_dict
[
"encoder_input_ids"
]
decoder_input_ids
=
inputs_dict
[
"decoder_input_ids"
]
del
inputs_dict
[
"encoder_input_ids"
]
del
inputs_dict
[
"decoder_input_ids"
]
for
model_class
in
self
.
all_model_classes
:
model
=
model_class
(
config
)
wte
=
model
.
get_input_embeddings
()
if
not
self
.
is_encoder_decoder
:
inputs_dict
[
"inputs_embeds"
]
=
self
.
_get_embeds
(
wte
,
input_ids
)
else
:
inputs_dict
[
"encoder_inputs_embeds"
]
=
self
.
_get_embeds
(
wte
,
encoder_input_ids
)
inputs_dict
[
"decoder_inputs_embeds"
]
=
self
.
_get_embeds
(
wte
,
decoder_input_ids
)
outputs
=
model
(
inputs_dict
)
outputs
=
model
(
inputs_dict
)
...
...
transformers/tests/modeling_tf_t5_test.py
0 → 100644
View file @
e92bcb7e
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
unittest
import
shutil
import
sys
from
.modeling_tf_common_test
import
(
TFCommonTestCases
,
ids_tensor
)
from
.configuration_common_test
import
ConfigTester
from
.utils
import
require_tf
,
slow
from
transformers
import
T5Config
,
is_tf_available
if
is_tf_available
():
import
tensorflow
as
tf
from
transformers.modeling_tf_t5
import
(
TFT5Model
,
TFT5WithLMHeadModel
,
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
)
@
require_tf
class
TFT5ModelTest
(
TFCommonTestCases
.
TFCommonModelTester
):
is_encoder_decoder
=
True
all_model_classes
=
(
TFT5Model
,
TFT5WithLMHeadModel
)
if
is_tf_available
()
else
()
class
TFT5ModelTester
(
object
):
def
__init__
(
self
,
parent
,
batch_size
=
13
,
seq_length
=
7
,
is_training
=
True
,
use_input_mask
=
True
,
use_labels
=
True
,
vocab_size
=
99
,
n_positions
=
14
,
hidden_size
=
32
,
num_hidden_layers
=
5
,
num_attention_heads
=
4
,
d_ff
=
37
,
relative_attention_num_buckets
=
8
,
dropout_rate
=
0.1
,
initializer_factor
=
0.002
,
scope
=
None
,
):
self
.
parent
=
parent
self
.
batch_size
=
batch_size
self
.
seq_length
=
seq_length
self
.
is_training
=
is_training
self
.
use_input_mask
=
use_input_mask
self
.
use_labels
=
use_labels
self
.
vocab_size
=
vocab_size
self
.
n_positions
=
n_positions
self
.
hidden_size
=
hidden_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
d_ff
=
d_ff
self
.
relative_attention_num_buckets
=
relative_attention_num_buckets
self
.
dropout_rate
=
dropout_rate
self
.
initializer_factor
=
initializer_factor
self
.
scope
=
scope
def
prepare_config_and_inputs
(
self
):
input_ids
=
ids_tensor
([
self
.
batch_size
,
self
.
seq_length
],
self
.
vocab_size
)
input_mask
=
None
if
self
.
use_input_mask
:
input_mask
=
ids_tensor
([
self
.
batch_size
,
self
.
seq_length
],
vocab_size
=
2
)
token_labels
=
None
if
self
.
use_labels
:
token_labels
=
ids_tensor
([
self
.
batch_size
,
self
.
seq_length
],
self
.
vocab_size
)
config
=
T5Config
(
vocab_size_or_config_json_file
=
self
.
vocab_size
,
n_positions
=
self
.
n_positions
,
d_model
=
self
.
hidden_size
,
d_ff
=
self
.
d_ff
,
d_kv
=
self
.
hidden_size
//
self
.
num_attention_heads
,
num_layers
=
self
.
num_hidden_layers
,
num_heads
=
self
.
num_attention_heads
,
relative_attention_num_buckets
=
self
.
relative_attention_num_buckets
,
dropout_rate
=
self
.
dropout_rate
,
initializer_factor
=
self
.
initializer_factor
)
return
(
config
,
input_ids
,
input_mask
,
token_labels
)
def
create_and_check_t5_model
(
self
,
config
,
input_ids
,
input_mask
,
token_labels
):
model
=
TFT5Model
(
config
=
config
)
inputs
=
{
'encoder_input_ids'
:
input_ids
,
'decoder_input_ids'
:
input_ids
,
'decoder_attention_mask'
:
input_mask
}
encoder_output
,
decoder_output
=
model
(
inputs
)
encoder_output
,
decoder_output
=
model
(
input_ids
,
decoder_attention_mask
=
input_mask
,
encoder_input_ids
=
input_ids
)
result
=
{
"encoder_output"
:
encoder_output
.
numpy
(),
"decoder_output"
:
decoder_output
.
numpy
(),
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"encoder_output"
].
shape
),
[
self
.
batch_size
,
self
.
seq_length
,
self
.
hidden_size
])
self
.
parent
.
assertListEqual
(
list
(
result
[
"decoder_output"
].
shape
),
[
self
.
batch_size
,
self
.
seq_length
,
self
.
hidden_size
])
def
create_and_check_t5_with_lm_head
(
self
,
config
,
input_ids
,
input_mask
,
token_labels
):
model
=
TFT5WithLMHeadModel
(
config
=
config
)
inputs
=
{
'encoder_input_ids'
:
input_ids
,
'decoder_input_ids'
:
input_ids
,
'decoder_attention_mask'
:
input_mask
}
prediction_scores
,
decoder_output
=
model
(
inputs
)
result
=
{
"prediction_scores"
:
prediction_scores
.
numpy
(),
}
self
.
parent
.
assertListEqual
(
list
(
result
[
"prediction_scores"
].
shape
),
[
self
.
batch_size
,
self
.
seq_length
,
self
.
vocab_size
])
def
prepare_config_and_inputs_for_common
(
self
):
config_and_inputs
=
self
.
prepare_config_and_inputs
()
(
config
,
input_ids
,
input_mask
,
token_labels
)
=
config_and_inputs
inputs_dict
=
{
'encoder_input_ids'
:
input_ids
,
'decoder_input_ids'
:
input_ids
,
'decoder_attention_mask'
:
input_mask
}
return
config
,
inputs_dict
def
setUp
(
self
):
self
.
model_tester
=
TFT5ModelTest
.
TFT5ModelTester
(
self
)
self
.
config_tester
=
ConfigTester
(
self
,
config_class
=
T5Config
,
d_model
=
37
)
def
test_config
(
self
):
self
.
config_tester
.
run_common_tests
()
def
test_t5_model
(
self
):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_t5_model
(
*
config_and_inputs
)
def
test_with_lm_head
(
self
):
config_and_inputs
=
self
.
model_tester
.
prepare_config_and_inputs
()
self
.
model_tester
.
create_and_check_t5_with_lm_head
(
*
config_and_inputs
)
@
slow
def
test_model_from_pretrained
(
self
):
cache_dir
=
"/tmp/transformers_test/"
for
model_name
in
[
't5-small'
]:
model
=
TFT5Model
.
from_pretrained
(
model_name
,
cache_dir
=
cache_dir
)
shutil
.
rmtree
(
cache_dir
)
self
.
assertIsNotNone
(
model
)
if
__name__
==
"__main__"
:
unittest
.
main
()
transformers/tests/modeling_tf_transfo_xl_test.py
View file @
e92bcb7e
...
@@ -67,7 +67,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
...
@@ -67,7 +67,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
self
.
batch_size
=
batch_size
self
.
batch_size
=
batch_size
self
.
seq_length
=
seq_length
self
.
seq_length
=
seq_length
self
.
mem_len
=
mem_len
self
.
mem_len
=
mem_len
self
.
key_len
=
seq_length
+
mem_len
self
.
key_len
gth
=
seq_length
+
mem_len
self
.
clamp_len
=
clamp_len
self
.
clamp_len
=
clamp_len
self
.
is_training
=
is_training
self
.
is_training
=
is_training
self
.
use_labels
=
use_labels
self
.
use_labels
=
use_labels
...
...
transformers/tests/modeling_transfo_xl_test.py
View file @
e92bcb7e
...
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
...
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
self
.
batch_size
=
batch_size
self
.
batch_size
=
batch_size
self
.
seq_length
=
seq_length
self
.
seq_length
=
seq_length
self
.
mem_len
=
mem_len
self
.
mem_len
=
mem_len
self
.
key_len
=
seq_length
+
mem_len
self
.
key_len
gth
=
seq_length
+
mem_len
self
.
clamp_len
=
clamp_len
self
.
clamp_len
=
clamp_len
self
.
is_training
=
is_training
self
.
is_training
=
is_training
self
.
use_labels
=
use_labels
self
.
use_labels
=
use_labels
...
...
transformers/tests/tokenization_t5_test.py
0 → 100644
View file @
e92bcb7e
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
unittest
from
transformers.tokenization_t5
import
(
T5Tokenizer
)
from
transformers.tokenization_xlnet
import
SPIECE_UNDERLINE
from
.tokenization_tests_commons
import
CommonTestCases
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
'fixtures/test_sentencepiece.model'
)
class
T5TokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
tokenizer_class
=
T5Tokenizer
def
setUp
(
self
):
super
(
T5TokenizationTest
,
self
).
setUp
()
# We have a SentencePiece fixture for testing
tokenizer
=
T5Tokenizer
(
SAMPLE_VOCAB
)
tokenizer
.
save_pretrained
(
self
.
tmpdirname
)
def
get_tokenizer
(
self
,
**
kwargs
):
return
T5Tokenizer
.
from_pretrained
(
self
.
tmpdirname
,
**
kwargs
)
def
get_input_output_texts
(
self
):
input_text
=
u
"This is a test"
output_text
=
u
"This is a test"
return
input_text
,
output_text
def
test_full_tokenizer
(
self
):
tokenizer
=
T5Tokenizer
(
SAMPLE_VOCAB
)
tokens
=
tokenizer
.
tokenize
(
u
'This is a test'
)
self
.
assertListEqual
(
tokens
,
[
u
'▁This'
,
u
'▁is'
,
u
'▁a'
,
u
'▁t'
,
u
'est'
])
self
.
assertListEqual
(
tokenizer
.
convert_tokens_to_ids
(
tokens
),
[
285
,
46
,
10
,
170
,
382
])
tokens
=
tokenizer
.
tokenize
(
u
"I was born in 92000, and this is falsé."
)
self
.
assertListEqual
(
tokens
,
[
SPIECE_UNDERLINE
+
u
'I'
,
SPIECE_UNDERLINE
+
u
'was'
,
SPIECE_UNDERLINE
+
u
'b'
,
u
'or'
,
u
'n'
,
SPIECE_UNDERLINE
+
u
'in'
,
SPIECE_UNDERLINE
+
u
''
,
u
'9'
,
u
'2'
,
u
'0'
,
u
'0'
,
u
'0'
,
u
','
,
SPIECE_UNDERLINE
+
u
'and'
,
SPIECE_UNDERLINE
+
u
'this'
,
SPIECE_UNDERLINE
+
u
'is'
,
SPIECE_UNDERLINE
+
u
'f'
,
u
'al'
,
u
's'
,
u
'é'
,
u
'.'
])
ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
self
.
assertListEqual
(
ids
,
[
8
,
21
,
84
,
55
,
24
,
19
,
7
,
0
,
602
,
347
,
347
,
347
,
3
,
12
,
66
,
46
,
72
,
80
,
6
,
0
,
4
])
back_tokens
=
tokenizer
.
convert_ids_to_tokens
(
ids
)
self
.
assertListEqual
(
back_tokens
,
[
SPIECE_UNDERLINE
+
u
'I'
,
SPIECE_UNDERLINE
+
u
'was'
,
SPIECE_UNDERLINE
+
u
'b'
,
u
'or'
,
u
'n'
,
SPIECE_UNDERLINE
+
u
'in'
,
SPIECE_UNDERLINE
+
u
''
,
u
'<unk>'
,
u
'2'
,
u
'0'
,
u
'0'
,
u
'0'
,
u
','
,
SPIECE_UNDERLINE
+
u
'and'
,
SPIECE_UNDERLINE
+
u
'this'
,
SPIECE_UNDERLINE
+
u
'is'
,
SPIECE_UNDERLINE
+
u
'f'
,
u
'al'
,
u
's'
,
u
'<unk>'
,
u
'.'
])
if
__name__
==
'__main__'
:
unittest
.
main
()
transformers/tokenization_auto.py
View file @
e92bcb7e
...
@@ -30,6 +30,7 @@ from .tokenization_roberta import RobertaTokenizer
...
@@ -30,6 +30,7 @@ from .tokenization_roberta import RobertaTokenizer
from
.tokenization_distilbert
import
DistilBertTokenizer
from
.tokenization_distilbert
import
DistilBertTokenizer
from
.tokenization_camembert
import
CamembertTokenizer
from
.tokenization_camembert
import
CamembertTokenizer
from
.tokenization_albert
import
AlbertTokenizer
from
.tokenization_albert
import
AlbertTokenizer
from
.tokenization_t5
import
T5Tokenizer
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -44,6 +45,7 @@ class AutoTokenizer(object):
...
@@ -44,6 +45,7 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching
The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5Tokenizer (T5 model)
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `albert`: AlbertTokenizer (ALBERT model)
- contains `albert`: AlbertTokenizer (ALBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model)
...
@@ -69,6 +71,7 @@ class AutoTokenizer(object):
...
@@ -69,6 +71,7 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching
The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5Tokenizer (T5 model)
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `albert`: AlbertTokenizer (ALBERT model)
- contains `albert`: AlbertTokenizer (ALBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model)
...
@@ -119,7 +122,9 @@ class AutoTokenizer(object):
...
@@ -119,7 +122,9 @@ class AutoTokenizer(object):
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
"""
"""
if
'distilbert'
in
pretrained_model_name_or_path
:
if
't5'
in
pretrained_model_name_or_path
:
return
T5Tokenizer
.
from_pretrained
(
pretrained_model_name_or_path
,
*
inputs
,
**
kwargs
)
elif
'distilbert'
in
pretrained_model_name_or_path
:
return
DistilBertTokenizer
.
from_pretrained
(
pretrained_model_name_or_path
,
*
inputs
,
**
kwargs
)
return
DistilBertTokenizer
.
from_pretrained
(
pretrained_model_name_or_path
,
*
inputs
,
**
kwargs
)
elif
'albert'
in
pretrained_model_name_or_path
:
elif
'albert'
in
pretrained_model_name_or_path
:
return
AlbertTokenizer
.
from_pretrained
(
pretrained_model_name_or_path
,
*
inputs
,
**
kwargs
)
return
AlbertTokenizer
.
from_pretrained
(
pretrained_model_name_or_path
,
*
inputs
,
**
kwargs
)
...
...
transformers/tokenization_t5.py
0 → 100644
View file @
e92bcb7e
# coding=utf-8
# Copyright 2018 T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model T5."""
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
logging
import
os
import
re
import
six
from
shutil
import
copyfile
from
.tokenization_utils
import
PreTrainedTokenizer
logger
=
logging
.
getLogger
(
__name__
)
SPIECE_UNDERLINE
=
u
'▁'
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES
=
{
'vocab_file'
:
'spiece.model'
}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP
=
{
'vocab_file'
:
{
't5-small'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model"
,
't5-base'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model"
,
't5-large'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model"
,
't5-3b'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model"
,
't5-11b'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model"
,
}
}
####################################################
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
=
{
't5-small'
:
512
,
't5-base'
:
512
,
't5-large'
:
512
,
't5-3b'
:
512
,
't5-11b'
:
512
,
}
class
T5Tokenizer
(
PreTrainedTokenizer
):
"""
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
- `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as `<extra_id_{%d}>` where `{%d}` is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to beginnning (<extra_id_0> is the last token in the vocabulary)
(like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
"""
vocab_files_names
=
VOCAB_FILES_NAMES
pretrained_vocab_files_map
=
PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes
=
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def
__init__
(
self
,
vocab_file
,
eos_token
=
"</s>"
,
unk_token
=
"<unk>"
,
pad_token
=
"<pad>"
,
extra_ids
=
100
,
additional_special_tokens
=
None
,
**
kwargs
):
# Add extra_ids to the special token list
if
extra_ids
>
0
:
if
additional_special_tokens
is
None
:
additional_special_tokens
=
[]
additional_special_tokens
.
extend
([
u
"<extra_id_{}>"
.
format
(
i
)
for
i
in
range
(
extra_ids
)])
super
(
T5Tokenizer
,
self
).
__init__
(
eos_token
=
eos_token
,
unk_token
=
unk_token
,
pad_token
=
pad_token
,
additional_special_tokens
=
additional_special_tokens
,
**
kwargs
)
try
:
import
sentencepiece
as
spm
except
ImportError
:
logger
.
warning
(
"You need to install SentencePiece to use T5Tokenizer:"
"https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
self
.
vocab_file
=
vocab_file
self
.
_extra_ids
=
extra_ids
self
.
sp_model
=
spm
.
SentencePieceProcessor
()
self
.
sp_model
.
Load
(
vocab_file
)
@
property
def
vocab_size
(
self
):
return
self
.
sp_model
.
get_piece_size
()
+
self
.
_extra_ids
def
__getstate__
(
self
):
state
=
self
.
__dict__
.
copy
()
state
[
"sp_model"
]
=
None
return
state
def
__setstate__
(
self
,
d
):
self
.
__dict__
=
d
try
:
import
sentencepiece
as
spm
except
ImportError
:
logger
.
warning
(
"You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
self
.
sp_model
=
spm
.
SentencePieceProcessor
()
self
.
sp_model
.
Load
(
self
.
vocab_file
)
def
_tokenize
(
self
,
text
,
return_unicode
=
True
,
sample
=
False
):
""" Take as input a string and return a list of strings (tokens) for words/sub-words
"""
if
not
sample
:
pieces
=
self
.
sp_model
.
EncodeAsPieces
(
text
)
else
:
pieces
=
self
.
sp_model
.
SampleEncodeAsPieces
(
text
,
64
,
0.1
)
# convert back to unicode for py2
if
six
.
PY2
and
return_unicode
:
ret_pieces
=
[]
for
piece
in
pieces
:
if
isinstance
(
piece
,
str
):
piece
=
piece
.
decode
(
'utf-8'
)
ret_pieces
.
append
(
piece
)
pieces
=
ret_pieces
return
pieces
def
_convert_token_to_id
(
self
,
token
):
""" Converts a token (str/unicode) in an id using the vocab. """
if
token
.
startswith
(
u
"<extra_id_"
):
l
=
re
.
match
(
r
'<extra_id_(\d+)>'
,
token
)
num
=
int
(
l
.
group
(
1
))
return
self
.
vocab_size
-
num
-
1
return
self
.
sp_model
.
piece_to_id
(
token
)
def
_convert_id_to_token
(
self
,
index
,
return_unicode
=
True
):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
if
index
<
self
.
sp_model
.
get_piece_size
():
token
=
self
.
sp_model
.
IdToPiece
(
index
)
else
:
token
=
u
"<extra_id_{}>"
.
format
(
self
.
vocab_size
-
1
-
index
)
if
six
.
PY2
and
return_unicode
and
isinstance
(
token
,
str
):
token
=
token
.
decode
(
'utf-8'
)
return
token
def
convert_tokens_to_string
(
self
,
tokens
):
""" Converts a sequence of tokens (string) in a single string. """
out_string
=
self
.
sp_model
.
decode_pieces
(
tokens
)
return
out_string
def
save_vocabulary
(
self
,
save_directory
):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if
not
os
.
path
.
isdir
(
save_directory
):
logger
.
error
(
"Vocabulary path ({}) should be a directory"
.
format
(
save_directory
))
return
out_vocab_file
=
os
.
path
.
join
(
save_directory
,
VOCAB_FILES_NAMES
[
'vocab_file'
])
if
os
.
path
.
abspath
(
self
.
vocab_file
)
!=
os
.
path
.
abspath
(
out_vocab_file
):
copyfile
(
self
.
vocab_file
,
out_vocab_file
)
return
(
out_vocab_file
,)
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment