chenpangpang / transformers · Commit 4060d685
"tests/test_modeling_unispeech.py" did not exist on "d5ff69fce92bb1aab9273d674e762a8eddcb2e3f"
XGLM: Fix left-padding (PT and TF) (#22828)

Unverified commit, authored Apr 20, 2023 by Joao Gante and committed by GitHub on Apr 20, 2023. Parent: 474bf508
Showing 5 changed files with 165 additions and 263 deletions (+165, -263):
- src/transformers/models/xglm/modeling_flax_xglm.py (+0, -12)
- src/transformers/models/xglm/modeling_tf_xglm.py (+25, -18)
- src/transformers/models/xglm/modeling_xglm.py (+65, -133)
- tests/models/xglm/test_modeling_tf_xglm.py (+36, -64)
- tests/models/xglm/test_modeling_xglm.py (+39, -36)
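The changes below thread an explicit `position_ids` argument through the PT and TF XGLM models so that left-padded batches generate correctly. As a quick standalone illustration of the underlying problem (not part of the commit; the pad id and tensors here are made up), positions derived naively from token order drift away from the real tokens once a row is left-padded, while positions derived from the attention mask do not:

```python
import torch

# Toy left-padded batch; 1 is the (assumed) pad token id.
input_ids = torch.tensor([
    [1, 1, 5, 6, 7],   # short prompt, left-padded
    [2, 3, 4, 5, 6],   # full-length prompt
])
attention_mask = (input_ids != 1).long()

# Naive positions ignore the padding, so the short prompt's tokens get shifted positions.
naive_position_ids = torch.arange(input_ids.shape[-1]).expand_as(input_ids)

# Mask-aware positions (the pattern this commit introduces): count only real tokens and
# park the padded slots on an arbitrary valid index (here 1), since they are masked anyway.
position_ids = attention_mask.cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

print(naive_position_ids)
# tensor([[0, 1, 2, 3, 4],
#         [0, 1, 2, 3, 4]])
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```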
src/transformers/models/xglm/modeling_flax_xglm.py

```diff
@@ -124,18 +124,6 @@ def create_sinusoidal_positions(n_pos, dim, padding_idx=1):
     return jnp.array(emb)
 
 
-def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
-    """
-    Shift input ids one token to the right.
-    """
-    shifted_input_ids = jnp.roll(input_ids, 1, axis=-1)
-    shifted_input_ids = shifted_input_ids.at[(..., 0)].set(decoder_start_token_id)
-    # replace possible -100 values in labels by `pad_token_id`
-    shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
-
-    return shifted_input_ids
-
-
 class FlaxXGLMAttention(nn.Module):
     config: XGLMConfig
     embed_dim: int
```
src/transformers/models/xglm/modeling_tf_xglm.py

```diff
@@ -476,19 +476,8 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
 
         return combined_attention_mask
 
-    def embed_positions(
-        self,
-        input_ids: Optional[TFModelInputType] = None,
-        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
-        past_key_values_length: Optional[int] = None,
-    ) -> tf.Tensor:
-        if input_ids is not None:
-            position_ids = _create_position_ids_from_input_ids(
-                input_ids, past_key_values_length, self.padding_idx
-            )
-        else:
-            position_ids = _create_position_ids_from_inputs_embeds(
-                inputs_embeds, past_key_values_length, self.padding_idx
-            )
+    def embed_positions(self, position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None) -> tf.Tensor:
+        position_ids += self.offset
         positions = tf.gather(self._embed_positions_weights, position_ids, axis=0)
         return positions
```
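Not part of the diff, but to make the new calling convention concrete: the layer now receives explicit position ids and simply gathers rows from its positional table, so the caller decides which positions left-padded tokens map to. A minimal TensorFlow sketch with a toy table (the real module uses sinusoidal weights plus an offset, omitted here), using the same exclusive-cumsum trick that the TF `prepare_inputs_for_generation` change further down relies on:

```python
import tensorflow as tf

# Toy, left-padded attention mask (1 = real token, 0 = padding).
attention_mask = tf.constant([[0, 0, 1, 1, 1],
                              [1, 1, 1, 1, 1]], dtype=tf.int32)

# Exclusive cumsum gives each real token its 0-based position within its own sequence.
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)

# Gather from a stand-in positional table of shape [max_positions, hidden].
toy_table = tf.random.normal([16, 4])
positions = tf.gather(toy_table, position_ids, axis=0)

print(position_ids.numpy())  # [[0 0 0 1 2], [0 1 2 3 4]]
print(positions.shape)       # (2, 5, 4)
```

The remaining hunks of `modeling_tf_xglm.py` follow.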
```diff
@@ -497,6 +486,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
         self,
         input_ids: Optional[TFModelInputType] = None,
         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
@@ -528,9 +518,14 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         # past_key_values_length
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
+        if position_ids is None:
+            position_ids = tf.expand_dims(
+                tf.range(past_key_values_length, input_shape[-1] + past_key_values_length), axis=0
+            )
+            position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
         if inputs_embeds is None:
             # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
             # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
@@ -552,7 +547,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
             encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])
 
         # embed positions
-        positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
+        positions = self.embed_positions(position_ids)
 
         hidden_states = tf.cast(inputs_embeds, dtype=tf.float32) + positions
```
```diff
@@ -713,6 +708,11 @@ XGLM_INPUTS_DOCSTRING = r"""
             - 0 for tokens that are **masked**.
 
             [What are attention masks?](../glossary#attention-mask)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
         encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
             the decoder.
@@ -796,6 +796,7 @@ class TFXGLMModel(TFXGLMPreTrainedModel):
         self,
         input_ids: Optional[TFModelInputType] = None,
         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
```
```diff
@@ -876,9 +877,6 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
             name="lm_head",
         )
 
-        # TODO (Joao): investigate why XGLM has numerical issues in XLA generate
-        self.supports_xla_generation = False
-
     def get_output_embeddings(self):
         return self.lm_head
 
@@ -890,11 +888,18 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         if past_key_values:
             inputs = tf.expand_dims(inputs[:, -1], -1)
 
+        position_ids = kwargs.get("position_ids", None)
         attention_mask = kwargs.get("attention_mask", None)
 
+        if attention_mask is not None and position_ids is None:
+            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
+            if past_key_values:
+                position_ids = tf.expand_dims(position_ids[:, -1], -1)
+
         return {
             "input_ids": inputs,
             "attention_mask": attention_mask,
+            "position_ids": position_ids,
             "past_key_values": past_key_values,
             "use_cache": use_cache,
         }
```
```diff
@@ -911,6 +916,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         self,
         input_ids: Optional[TFModelInputType] = None,
         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
@@ -935,6 +941,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             head_mask=head_mask,
```
src/transformers/models/xglm/modeling_xglm.py

```diff
@@ -75,11 +75,34 @@ XGLM_INPUTS_DOCSTRING = r"""
             - 0 for tokens that are **masked**.
 
             [What are attention masks?](../glossary#attention-mask)
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
+            the decoder.
+        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+            selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -88,20 +111,12 @@ XGLM_INPUTS_DOCSTRING = r"""
             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
 
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
-            sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to
-            directly pass an embedded representation. This is useful if you want more control over how to convert
-            `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
-            `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
-            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
-            associated vectors than the model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape
+            `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you
+            can choose to directly pass an embedded representation. This is useful if you want more control over how to
+            convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -146,18 +161,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
-def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-    """
-    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-    are ignored. This is modified from fairseq's `utils.make_positions`.
-    """
-    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-    mask = input_ids.ne(padding_idx).int()
-    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-    return incremental_indices.long() + padding_idx
-
-
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->XGLM
 class XGLMSinusoidalPositionalEmbedding(nn.Module):
     """This module produces sinusoidal positional embeddings of any length."""
```
```diff
@@ -198,43 +201,17 @@ class XGLMSinusoidalPositionalEmbedding(nn.Module):
         return emb.to(torch.get_default_dtype())
 
     @torch.no_grad()
-    def forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0):
-        if input_ids is not None:
-            bsz, seq_len = input_ids.size()
-            # Create the position ids from the input token ids. Any padded tokens remain padded.
-            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
-                input_ids.device
-            )
-        else:
-            bsz, seq_len = inputs_embeds.size()[:-1]
-            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+    def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
+        bsz, seq_len = position_ids.size()
+        position_ids += self.offset
 
-        # expand embeddings if needed
-        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        # Expand embeddings if needed. `position_ids.max()` is NOT used to keep torch.fx compatibility.
+        max_pos = 2 + seq_len + past_key_values_length
         if max_pos > self.weights.size(0):
-            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+            self.make_weights(max_pos, self.embedding_dim, self.padding_idx)
 
         return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
 
-    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-
-        Args:
-            inputs_embeds: torch.Tensor
-
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-
-        position_ids = torch.arange(
-            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
-
 
 class XGLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
```
```diff
@@ -605,6 +582,7 @@ class XGLMModel(XGLMPreTrainedModel):
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         head_mask: Optional[torch.Tensor] = None,
```
```diff
@@ -616,70 +594,6 @@ class XGLMModel(XGLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-
-                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
-                selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
-                shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
-                `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
-                control over how to convert `input_ids` indices into associated vectors than the model's internal
-                embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
```
```diff
@@ -698,9 +612,19 @@ class XGLMModel(XGLMPreTrainedModel):
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         # past_key_values_length
         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
+        if position_ids is None:
+            position_ids = torch.arange(
+                past_key_values_length,
+                input_shape[-1] + past_key_values_length,
+                dtype=torch.long,
+                device=input_ids.device if input_ids is not None else inputs_embeds.device,
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+        else:
+            position_ids = position_ids.view(-1, input_shape[-1])
+
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
```
```diff
@@ -713,11 +637,7 @@ class XGLMModel(XGLMPreTrainedModel):
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
 
-        # embed positions
-        positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
-
-        hidden_states = inputs_embeds + positions
+        hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length)
 
         hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)
 
         if self.gradient_checkpointing and self.training:
```
```diff
@@ -866,6 +786,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         head_mask: Optional[torch.Tensor] = None,
@@ -895,6 +816,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
            position_ids=position_ids,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             head_mask=head_mask,
```
```diff
@@ -935,6 +857,15 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs):
+        position_ids = kwargs.get("position_ids", None)
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        else:
+            position_ids = None
+
         # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
         if attention_mask is None:
             attention_mask = input_ids.new_ones(input_ids.shape)
@@ -945,6 +876,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
         return {
             "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
             "attention_mask": attention_mask,
+            "position_ids": position_ids,
             "past_key_values": past_key_values,
             "use_cache": use_cache,
         }
```
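For reference, a standalone sketch of what the new PyTorch `prepare_inputs_for_generation` computes for a left-padded batch (toy mask; the dummy value 1 for padded slots is arbitrary, since those positions are masked out anyway):

```python
import torch

# Toy left-padded attention mask for a two-sentence batch (0 = padding).
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

# First generation step: each real token gets its position within its own sequence.
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])

# Later steps with a cache only feed the newest position, which keeps advancing from each
# row's true length (equivalent to recomputing the cumsum over the grown mask and slicing [:, -1]).
next_position = position_ids[:, -1].unsqueeze(-1) + 1
print(next_position)
# tensor([[3],
#         [5]])
```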
tests/models/xglm/test_modeling_tf_xglm.py

```diff
@@ -175,44 +175,6 @@ class TFXGLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         name = model.get_bias()
         assert name is None
 
-    @slow
-    def test_batch_generation(self):
-        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
-
-        tokenizer.padding_side = "left"
-
-        # use different length sentences to test batching
-        sentences = [
-            "Hello, my dog is a little",
-            "Today, I",
-        ]
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
-        output_non_padded = model.generate(input_ids=inputs_non_padded)
-
-        num_paddings = (
-            inputs_non_padded.shape[-1]
-            - tf.math.reduce_sum(tf.cast(inputs["attention_mask"][-1], dtype=tf.int64)).numpy()
-        )
-        inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
-        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "Hello, my dog is a little bit of a shy one, but he is very friendly",
-            "Today, I am going to share with you a few of my favorite things",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
```
```diff
@@ -246,6 +208,8 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
         tf.random.set_seed(0)
         tokenized = tokenizer("Today is a nice day and", return_tensors="tf")
         input_ids = tokenized.input_ids
-        output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
+        # forces the generation to happen on CPU, to avoid GPU-related quirks (and assure same output regardless of the available devices)
+        with tf.device(":/CPU:0"):
+            output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
         output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
@@ -255,33 +219,41 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
         self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
 
     @slow
-    def test_lm_generate_xglm_left_padding(self):
-        """Tests that the generated text is the same, regarless of left padding"""
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
+    def test_batch_generation(self):
         model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
 
         tokenizer.padding_side = "left"
-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
-            "no_repeat_ngram_size": 2,
-            "do_sample": False,
-            "repetition_penalty": 1.3,
-        }
-        expected_output_string = (
-            "Today is a beautiful day and I am so glad that we have the opportunity to spend time with"
-        )
 
-        sentences = ["Today is a beautiful day and"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-        # using default length
-        output_ids = model.generate(**input_ids, **generation_kwargs)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertEqual(output_strings[0], expected_output_string)
+        # use different length sentences to test batching
+        sentences = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When",
+            "Hello, my dog is a little",
+        ]
 
-        sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-        # longer max length to capture the full length (remember: it is left padded)
-        output_ids = model.generate(**input_ids, **generation_kwargs, max_length=28)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertEqual(output_strings[0], expected_output_string)
+        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
+        input_ids = inputs["input_ids"]
+        outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"], max_new_tokens=12)
+
+        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
+        output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
+
+        inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
+        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
+
+        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
+        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
+
+        expected_output_sentence = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
+            "a single",
+            "Hello, my dog is a little bit of a shy one, but he is very friendly",
+        ]
+        self.assertListEqual(expected_output_sentence, batch_out_sentence)
+        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
```
tests/models/xglm/test_modeling_xglm.py

```diff
@@ -340,6 +340,35 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xglm_weight_initialization(*config_and_inputs)
 
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = XGLMModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+
+@require_torch
+class XGLMModelLanguageGenerationTest(unittest.TestCase):
+    def _test_lm_generate_xglm_helper(
+        self,
+        gradient_checkpointing=False,
+        verify_outputs=True,
+    ):
+        model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+        if gradient_checkpointing:
+            model.gradient_checkpointing_enable()
+        else:
+            model.gradient_checkpointing_disable()
+        model.to(torch_device)
+        input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device)  # The dog
+        # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
+        # fmt: off
+        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]
+        # fmt: on
+        output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
+        if verify_outputs:
+            self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
+
     @slow
     def test_batch_generation(self):
         model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
```
```diff
@@ -350,65 +379,39 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
 
         # use different length sentences to test batching
         sentences = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When",
             "Hello, my dog is a little",
-            "Today, I",
         ]
 
         inputs = tokenizer(sentences, return_tensors="pt", padding=True)
         input_ids = inputs["input_ids"].to(torch_device)
 
         outputs = model.generate(
-            input_ids=input_ids, attention_mask=inputs["attention_mask"].to(torch_device)
+            input_ids=input_ids, attention_mask=inputs["attention_mask"].to(torch_device), max_new_tokens=12
         )
 
         inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
-        output_non_padded = model.generate(input_ids=inputs_non_padded)
+        output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
 
-        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
         inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
-        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
+        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
 
         batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
         padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
 
         expected_output_sentence = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
+            "a single",
             "Hello, my dog is a little bit of a shy one, but he is very friendly",
-            "Today, I am going to share with you a few of my favorite things",
         ]
         self.assertListEqual(expected_output_sentence, batch_out_sentence)
         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
 
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            model = XGLMModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-@require_torch
-class XGLMModelLanguageGenerationTest(unittest.TestCase):
-    def _test_lm_generate_xglm_helper(
-        self,
-        gradient_checkpointing=False,
-        verify_outputs=True,
-    ):
-        model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        if gradient_checkpointing:
-            model.gradient_checkpointing_enable()
-        else:
-            model.gradient_checkpointing_disable()
-        model.to(torch_device)
-        input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device)  # The dog
-        # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
-        # fmt: off
-        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]
-        # fmt: on
-        output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
-        if verify_outputs:
-            self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
-
     @slow
     def test_lm_generate_xglm(self):
         self._test_lm_generate_xglm_helper()
```
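Finally, for anyone who wants to exercise the behaviour these tests pin down, a hedged end-to-end sketch of left-padded batched generation (the checkpoint matches the one used in the tests; the prompts and generation settings are illustrative):

```python
import torch
from transformers import AutoTokenizer, XGLMForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")

# Decoder-only generation with prompts of unequal length requires left padding.
tokenizer.padding_side = "left"

sentences = ["Hello, my dog is a little", "Today, I"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)

# With this fix, the padded prompt should produce the same continuation as it would unpadded,
# because position ids are now derived from the attention mask.
with torch.no_grad():
    output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=12)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```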