Unverified Commit ece76244 authored by Tianqi Zhang (张天启), committed by GitHub

Fix incorrect comments about atten mask for pytorch backend (#18728)



* fix incorrect comments about atten mask

* typo

* Update for CodeGen
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 0cea8d55
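
The hunks below all touch the same pattern: a 1.0/0.0 attention mask is turned into an additive mask whose masked entries are `torch.finfo(dtype).min`, not the `-10000.0` the old comments claimed. A minimal sketch of that pattern (illustrative only; the helper name and shapes are not from the patch):

```python
import torch

def build_additive_mask(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # attention_mask: 1.0 for positions we want to attend, 0.0 for masked positions,
    # shape (batch_size, seq_len).
    extended = attention_mask[:, None, None, :].to(dtype=dtype)  # broadcastable over heads/queries
    # 0.0 for positions we attend to, the dtype's smallest value for masked positions;
    # added to the raw scores before the softmax, this effectively removes masked positions.
    return (1.0 - extended) * torch.finfo(dtype).min

mask = torch.tensor([[1.0, 1.0, 0.0]])
print(build_additive_mask(mask, torch.float16))
# masked entry becomes -65504.0 (= torch.finfo(torch.float16).min), not -10000.0,
# which is why the old comments were misleading.
```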
@@ -756,7 +756,7 @@ class ModuleUtilsMixin:
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
+        # positions we want to attend and the dtype's smallest value for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
         extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
@@ -466,7 +466,7 @@ class CanineSelfAttention(nn.Module):
             attention_mask = torch.unsqueeze(attention_mask, dim=1)
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             attention_mask = (1.0 - attention_mask.float()) * torch.finfo(attention_scores.dtype).min
             # Apply the attention mask (precomputed for all layers in CanineModel forward() function)
             attention_scores = attention_scores + attention_mask
@@ -518,11 +518,11 @@ class CodeGenModel(CodeGenPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * -10000.0
+            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
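
CodeGen is the only model in this commit where the code itself changes, not just the comment: its masking constant moves from the hard-coded `-10000.0` to `torch.finfo(self.dtype).min`. A quick illustrative comparison (assuming `float16`; variable names are not from the patch):

```python
import torch

dtype = torch.float16
attention_mask = torch.tensor([[1.0, 0.0]], dtype=dtype)

old = (1.0 - attention_mask) * -10000.0                # CodeGen before this commit
new = (1.0 - attention_mask) * torch.finfo(dtype).min  # CodeGen after this commit

print(old)  # masked entry: -10000.0
print(new)  # masked entry: -65504.0 (torch.finfo(torch.float16).min)
```

Using the dtype's minimum keeps the additive mask consistent with what the updated comments describe and avoids a magic number that happens to be "large enough" only for some dtypes.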
@@ -431,7 +431,7 @@ class CTRLModel(CTRLPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -571,7 +571,7 @@ class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -805,7 +805,7 @@ class GPT2Model(GPT2PreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -565,7 +565,7 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -484,7 +484,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -606,7 +606,7 @@ class GPTJModel(GPTJPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -757,7 +757,7 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
@@ -955,7 +955,7 @@ class LxmertModel(LxmertPreTrainedModel):
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
+        # positions we want to attend and the dtype's smallest value for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
         extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
@@ -475,7 +475,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and -10000.0 for masked positions.
+            # positions we want to attend and the dtype's smallest value for masked positions.
             # Since we are adding it to the raw scores before the softmax, this is
             # effectively the same as removing these entirely.
             attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility