Unverified Commit 4060d685, authored by Joao Gante, committed by GitHub

XGLM: Fix left-padding (PT and TF) (#22828)

parent 474bf508
@@ -124,18 +124,6 @@ def create_sinusoidal_positions(n_pos, dim, padding_idx=1):
     return jnp.array(emb)

-def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
-    """
-    Shift input ids one token to the right.
-    """
-    shifted_input_ids = jnp.roll(input_ids, 1, axis=-1)
-    shifted_input_ids = shifted_input_ids.at[(..., 0)].set(decoder_start_token_id)
-    # replace possible -100 values in labels by `pad_token_id`
-    shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
-    return shifted_input_ids
-
 class FlaxXGLMAttention(nn.Module):
     config: XGLMConfig
     embed_dim: int
...
@@ -476,19 +476,8 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
         return combined_attention_mask

-    def embed_positions(
-        self,
-        input_ids: Optional[TFModelInputType] = None,
-        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
-        past_key_values_length: Optional[int] = None,
-    ) -> tf.Tensor:
-        if input_ids is not None:
-            position_ids = _create_position_ids_from_input_ids(input_ids, past_key_values_length, self.padding_idx)
-        else:
-            position_ids = _create_position_ids_from_inputs_embeds(
-                inputs_embeds, past_key_values_length, self.padding_idx
-            )
+    def embed_positions(self, position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None) -> tf.Tensor:
+        position_ids += self.offset
         positions = tf.gather(self._embed_positions_weights, position_ids, axis=0)
         return positions
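With this change, the TF position-embedding lookup no longer infers positions from `input_ids`: the caller supplies `position_ids`, the layer shifts them by its offset, and rows are gathered from the precomputed sinusoidal table. A minimal standalone sketch of that lookup, assuming an offset of 2 and a random stand-in for `self._embed_positions_weights` (names are illustrative, not the library API):

```python
import tensorflow as tf

OFFSET = 2  # assumption: the first rows of the table are reserved, fairseq-style
table = tf.random.normal((10 + OFFSET, 8))  # stand-in for the model's sinusoidal weight matrix

def embed_positions_sketch(position_ids):
    # shift into the table's index space, then gather one embedding row per position
    shifted = position_ids + OFFSET
    return tf.gather(table, shifted, axis=0)

position_ids = tf.constant([[0, 1, 2, 3]])
print(embed_positions_sketch(position_ids).shape)  # (1, 4, 8)
```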
@@ -497,6 +486,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
         self,
         input_ids: Optional[TFModelInputType] = None,
         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
@@ -528,9 +518,14 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

-        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

+        if position_ids is None:
+            position_ids = tf.expand_dims(
+                tf.range(past_key_values_length, input_shape[-1] + past_key_values_length), axis=0
+            )
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
         if inputs_embeds is None:
             # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
             # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
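When the caller passes no `position_ids`, the TF layer falls back to a plain range starting at `past_key_values_length`, which reproduces the previous behaviour for unpadded or right-padded inputs. A toy illustration of that fallback for a fresh prompt and for one cached decoding step (the `tf.tile` batch replication is only for display; the real code reshapes instead):

```python
import tensorflow as tf

def default_position_ids(batch_size, seq_len, past_key_values_length=0):
    # mirrors the fallback above: positions simply count up from the cache length
    position_ids = tf.expand_dims(
        tf.range(past_key_values_length, seq_len + past_key_values_length), axis=0
    )
    return tf.tile(position_ids, [batch_size, 1])

print(default_position_ids(2, 4).numpy())                            # [[0 1 2 3] [0 1 2 3]]
print(default_position_ids(2, 1, past_key_values_length=4).numpy())  # [[4] [4]]
```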
@@ -552,7 +547,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
             encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])

         # embed positions
-        positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
+        positions = self.embed_positions(position_ids)

         hidden_states = tf.cast(inputs_embeds, dtype=tf.float32) + positions
@@ -713,6 +708,11 @@ XGLM_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
        encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
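The new `position_ids` input is what lets a caller correct for left padding explicitly: positions can be derived from the attention mask rather than from token order. A hedged usage sketch with the 564M checkpoint used in the tests (downloading it requires network access; `generate()` now performs the same mask-based computation internally, so passing the ids by hand is only needed for direct forward calls):

```python
import tensorflow as tf
from transformers import TFXGLMForCausalLM, XGLMTokenizer

tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
tokenizer.padding_side = "left"
model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")

inputs = tokenizer(["Hello, my dog is a little", "Today, I"], return_tensors="tf", padding=True)

# padding-aware positions: each real token gets the count of real tokens before it
position_ids = tf.math.cumsum(inputs["attention_mask"], axis=-1, exclusive=True)

outputs = model(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    position_ids=position_ids,
)
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
```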
@@ -796,6 +796,7 @@ class TFXGLMModel(TFXGLMPreTrainedModel):
         self,
         input_ids: Optional[TFModelInputType] = None,
         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
@@ -876,9 +877,6 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
             name="lm_head",
         )

-        # TODO (Joao): investigate why XGLM has numerical issues in XLA generate
-        self.supports_xla_generation = False
-
     def get_output_embeddings(self):
         return self.lm_head
@@ -890,11 +888,18 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         if past_key_values:
             inputs = tf.expand_dims(inputs[:, -1], -1)

+        position_ids = kwargs.get("position_ids", None)
         attention_mask = kwargs.get("attention_mask", None)

+        if attention_mask is not None and position_ids is None:
+            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
+            if past_key_values:
+                position_ids = tf.expand_dims(position_ids[:, -1], -1)
+
         return {
             "input_ids": inputs,
             "attention_mask": attention_mask,
+            "position_ids": position_ids,
             "past_key_values": past_key_values,
             "use_cache": use_cache,
         }
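The exclusive cumulative sum over the attention mask is what keeps positions aligned under left padding, and once `past_key_values` exist only the last position is fed back in. A small demonstration of both steps on a toy mask (values chosen arbitrarily):

```python
import tensorflow as tf

attention_mask = tf.constant([[0, 0, 1, 1, 1],   # left-padded sequence
                              [1, 1, 1, 1, 1]])  # full-length sequence

# positions count only the real tokens that precede each slot
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)

# with a cache, only the most recent token (and therefore its position) is passed to the model
next_step_position_ids = tf.expand_dims(position_ids[:, -1], -1)

print(position_ids.numpy())            # [[0 0 0 1 2] [0 1 2 3 4]]
print(next_step_position_ids.numpy())  # [[2] [4]]
```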
@@ -911,6 +916,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         self,
         input_ids: Optional[TFModelInputType] = None,
         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
@@ -935,6 +941,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             head_mask=head_mask,
...
@@ -75,11 +75,34 @@ XGLM_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
+            the decoder.
+        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+            selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

+        cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
@@ -88,20 +111,12 @@ XGLM_INPUTS_DOCSTRING = r"""
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
-            sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to
-            directly pass an embedded representation. This is useful if you want more control over how to convert
-            `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
-            `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
-            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
-            associated vectors than the model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape
+            `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you
+            can choose to directly pass an embedded representation. This is useful if you want more control over how to
+            convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
@@ -146,18 +161,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

-def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-    """
-    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-    are ignored. This is modified from fairseq's `utils.make_positions`.
-    """
-    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-    mask = input_ids.ne(padding_idx).int()
-    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-    return incremental_indices.long() + padding_idx
-
-# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->XGLM
 class XGLMSinusoidalPositionalEmbedding(nn.Module):
     """This module produces sinusoidal positional embeddings of any length."""
@@ -198,43 +201,17 @@ class XGLMSinusoidalPositionalEmbedding(nn.Module):
         return emb.to(torch.get_default_dtype())

     @torch.no_grad()
-    def forward(
-        self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
-    ):
-        if input_ids is not None:
-            bsz, seq_len = input_ids.size()
-            # Create the position ids from the input token ids. Any padded tokens remain padded.
-            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
-                input_ids.device
-            )
-        else:
-            bsz, seq_len = inputs_embeds.size()[:-1]
-            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+    def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
+        bsz, seq_len = position_ids.size()
+        position_ids += self.offset

-        # expand embeddings if needed
-        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        # Expand embeddings if needed. `position_ids.max()` is NOT used to keep torch.fx compatibility.
+        max_pos = 2 + seq_len + past_key_values_length
         if max_pos > self.weights.size(0):
-            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+            self.make_weights(max_pos, self.embedding_dim, self.padding_idx)

         return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

-    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-
-        Args:
-            inputs_embeds: torch.Tensor
-
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-
-        position_ids = torch.arange(
-            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
-
 class XGLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -605,6 +582,7 @@ class XGLMModel(XGLMPreTrainedModel):
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         head_mask: Optional[torch.Tensor] = None,
@@ -616,70 +594,6 @@ class XGLMModel(XGLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-
-                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
-                selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
-                shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
-                `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
-                control over how to convert `input_ids` indices into associated vectors than the model's internal
-                embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -698,9 +612,19 @@ class XGLMModel(XGLMPreTrainedModel):
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")

-        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

+        if position_ids is None:
+            position_ids = torch.arange(
+                past_key_values_length,
+                input_shape[-1] + past_key_values_length,
+                dtype=torch.long,
+                device=input_ids.device if input_ids is not None else inputs_embeds.device,
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+        else:
+            position_ids = position_ids.view(-1, input_shape[-1])
+
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
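The fallback above is only correct when there is no left padding; the point of the fix is that generation now passes mask-aware `position_ids` instead of relying on it. A toy comparison of the two for a left-padded batch (the mask-aware variant is the one built in `prepare_inputs_for_generation` further down):

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],   # left-padded sequence
                               [1, 1, 1, 1, 1]])  # full-length sequence

# fallback used when position_ids is None: a plain range, identical for every row
default_ids = torch.arange(5).unsqueeze(0).expand(2, -1)

# mask-aware positions: real tokens are numbered from 0, pad slots are parked at 1
mask_aware_ids = attention_mask.long().cumsum(-1) - 1
mask_aware_ids.masked_fill_(attention_mask == 0, 1)

print(default_ids)     # tensor([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]])
print(mask_aware_ids)  # tensor([[1, 1, 0, 1, 2], [0, 1, 2, 3, 4]])
```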
@@ -713,11 +637,7 @@ class XGLMModel(XGLMPreTrainedModel):
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

-        # embed positions
-        positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
-
-        hidden_states = inputs_embeds + positions
+        hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length)

         hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)

         if self.gradient_checkpointing and self.training:
@@ -866,6 +786,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
         self,
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         head_mask: Optional[torch.Tensor] = None,
@@ -895,6 +816,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             head_mask=head_mask,
@@ -935,6 +857,15 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
     ):
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        else:
+            position_ids = None
         # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
         if attention_mask is None:
             attention_mask = input_ids.new_ones(input_ids.shape)
@@ -945,6 +876,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
         return {
             "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
             "attention_mask": attention_mask,
+            "position_ids": position_ids,
             "past_key_values": past_key_values,
             "use_cache": use_cache,
         }
...
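Taken together, these changes mean `generate()` now produces the same continuation for a sequence whether or not it sits in a left-padded batch. A hedged end-to-end check along the lines of the updated tests (greedy decoding is assumed as the default; downloading the checkpoint requires network access):

```python
from transformers import XGLMForCausalLM, XGLMTokenizer

tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
tokenizer.padding_side = "left"
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")

sentences = ["Hello, my dog is a little", "Today, I"]
batch = tokenizer(sentences, return_tensors="pt", padding=True)
batched = model.generate(**batch, max_new_tokens=12)

single = tokenizer(sentences[1], return_tensors="pt")
unbatched = model.generate(**single, max_new_tokens=12)

# with the fix, the left-padded row decodes to the same text as the unbatched run
print(tokenizer.decode(batched[1], skip_special_tokens=True))
print(tokenizer.decode(unbatched[0], skip_special_tokens=True))
```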
@@ -175,44 +175,6 @@ class TFXGLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
             name = model.get_bias()
             assert name is None

-    @slow
-    def test_batch_generation(self):
-        model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
-
-        tokenizer.padding_side = "left"
-
-        # use different length sentences to test batching
-        sentences = [
-            "Hello, my dog is a little",
-            "Today, I",
-        ]
-
-        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
-        outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
-
-        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
-        output_non_padded = model.generate(input_ids=inputs_non_padded)
-
-        num_paddings = (
-            inputs_non_padded.shape[-1]
-            - tf.math.reduce_sum(tf.cast(inputs["attention_mask"][-1], dtype=tf.int64)).numpy()
-        )
-        inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
-        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
-
-        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-        expected_output_sentence = [
-            "Hello, my dog is a little bit of a shy one, but he is very friendly",
-            "Today, I am going to share with you a few of my favorite things",
-        ]
-        self.assertListEqual(expected_output_sentence, batch_out_sentence)
-        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@@ -246,6 +208,8 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
         tf.random.set_seed(0)
         tokenized = tokenizer("Today is a nice day and", return_tensors="tf")
         input_ids = tokenized.input_ids
-        output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
+        # forces the generation to happen on CPU, to avoid GPU-related quirks (and assure same output regardless of the available devices)
+        with tf.device(":/CPU:0"):
+            output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
         output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
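Pinning sampled generation to the CPU is a simple way to make an expected-string test independent of the available accelerator; the same pattern can be reused outside the test. A hedged sketch that loads the same checkpoint as the test (network access required, seed values are arbitrary):

```python
import tensorflow as tf
from transformers import TFXGLMForCausalLM, XGLMTokenizer

tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
input_ids = tokenizer("Today is a nice day and", return_tensors="tf").input_ids

tf.random.set_seed(0)
# run sampling on CPU so the drawn tokens do not depend on which device is available
with tf.device(":/CPU:0"):
    output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```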
@@ -255,33 +219,41 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
         self.assertEqual(output_str, EXPECTED_OUTPUT_STR)

     @slow
-    def test_lm_generate_xglm_left_padding(self):
-        """Tests that the generated text is the same, regarless of left padding"""
-        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
+    def test_batch_generation(self):
         model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+        tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
         tokenizer.padding_side = "left"

-        generation_kwargs = {
-            "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
-            "no_repeat_ngram_size": 2,
-            "do_sample": False,
-            "repetition_penalty": 1.3,
-        }
-        expected_output_string = (
-            "Today is a beautiful day and I am so glad that we have the opportunity to spend time with"
-        )
-
-        sentences = ["Today is a beautiful day and"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-        # using default length
-        output_ids = model.generate(**input_ids, **generation_kwargs)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertEqual(output_strings[0], expected_output_string)
-
-        sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"]
-        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
-        # longer max length to capture the full length (remember: it is left padded)
-        output_ids = model.generate(**input_ids, **generation_kwargs, max_length=28)
-        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        self.assertEqual(output_strings[0], expected_output_string)
+        # use different length sentences to test batching
+        sentences = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When",
+            "Hello, my dog is a little",
+        ]
+
+        inputs = tokenizer(sentences, return_tensors="tf", padding=True)
+        input_ids = inputs["input_ids"]
+        outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"], max_new_tokens=12)
+
+        inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
+        output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
+
+        inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
+        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
+
+        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
+        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
+
+        expected_output_sentence = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
+            "a single",
+            "Hello, my dog is a little bit of a shy one, but he is very friendly",
+        ]
+        self.assertListEqual(expected_output_sentence, batch_out_sentence)
+        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
@@ -340,6 +340,35 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xglm_weight_initialization(*config_and_inputs)

+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = XGLMModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+
+@require_torch
+class XGLMModelLanguageGenerationTest(unittest.TestCase):
+    def _test_lm_generate_xglm_helper(
+        self,
+        gradient_checkpointing=False,
+        verify_outputs=True,
+    ):
+        model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
+        if gradient_checkpointing:
+            model.gradient_checkpointing_enable()
+        else:
+            model.gradient_checkpointing_disable()
+        model.to(torch_device)
+        input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device)  # The dog
+        # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
+        # fmt: off
+        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]
+        # fmt: on
+        output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
+        if verify_outputs:
+            self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
+
     @slow
     def test_batch_generation(self):
         model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
@@ -350,65 +379,39 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         # use different length sentences to test batching
         sentences = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When",
             "Hello, my dog is a little",
-            "Today, I",
         ]

         inputs = tokenizer(sentences, return_tensors="pt", padding=True)
         input_ids = inputs["input_ids"].to(torch_device)

         outputs = model.generate(
-            input_ids=input_ids,
-            attention_mask=inputs["attention_mask"].to(torch_device),
+            input_ids=input_ids, attention_mask=inputs["attention_mask"].to(torch_device), max_new_tokens=12
         )

         inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
-        output_non_padded = model.generate(input_ids=inputs_non_padded)
+        output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)

-        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
         inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
-        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
+        output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)

         batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
         padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)

         expected_output_sentence = [
+            "This is an extremelly long sentence that only exists to test the ability of the model to cope with "
+            "left-padding, such as in batched generation. The output for the sequence below should be the same "
+            "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
+            "a single",
             "Hello, my dog is a little bit of a shy one, but he is very friendly",
-            "Today, I am going to share with you a few of my favorite things",
         ]
         self.assertListEqual(expected_output_sentence, batch_out_sentence)
         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            model = XGLMModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-
-@require_torch
-class XGLMModelLanguageGenerationTest(unittest.TestCase):
-    def _test_lm_generate_xglm_helper(
-        self,
-        gradient_checkpointing=False,
-        verify_outputs=True,
-    ):
-        model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
-        if gradient_checkpointing:
-            model.gradient_checkpointing_enable()
-        else:
-            model.gradient_checkpointing_disable()
-        model.to(torch_device)
-        input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device)  # The dog
-        # </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
-        # fmt: off
-        expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581]
-        # fmt: on
-        output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
-        if verify_outputs:
-            self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
-
     @slow
     def test_lm_generate_xglm(self):
         self._test_lm_generate_xglm_helper()
...