chenpangpang / transformers · Commits
"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "17c634fd5b6c24396faf780b9b069beca1289d84"
Unverified commit d9deddb4, authored Feb 07, 2024 by Sourab Mangrulkar; committed by GitHub on Feb 07, 2024
fix Starcoder FA2 implementation (#28891)
Parent: 64d1518c
Showing 1 changed file with 1 addition and 8 deletions (+1 −8)
src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py (+1 −8)
...
@@ -363,13 +363,6 @@ class GPTBigCodeFlashAttention2(GPTBigCodeAttention):
         attn_dropout = self.attn_pdrop if self.training else 0.0

-        softmax_dtype = torch.float32 if self.attention_softmax_in_fp32 else query.dtype
-        upcast = query.dtype != softmax_dtype
-        softmax_scale = self.layer_idx + 1 if self.scale_attention_softmax_in_fp32 and upcast else 1
-        softmax_scale = softmax_scale**-1
-        if self.scale_attn_weights:
-            softmax_scale /= self.head_dim**0.5
-
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in float16 just to be sure everything works as expected.
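A reading of the deletion above (an interpretation of the diff, not an authoritative claim): the removed block rebuilt the per-layer softmax scale that belongs to the eager attention's fp32-upcast trick and forwarded it to FlashAttention, which already applies the standard 1/sqrt(head_dim) scaling on its own when no scale is passed. A minimal sketch of that default behavior, assuming the flash-attn 2.x package and a CUDA device; shapes and values here are illustrative:

import torch
from flash_attn import flash_attn_func  # assumes flash-attn 2.x and a CUDA GPU

# Illustrative tensors: (batch, seqlen, num_heads, head_dim); flash-attn requires fp16/bf16.
q = torch.randn(2, 16, 8, 64, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

# With softmax_scale=None (the default), flash-attn scales the attention logits by
# head_dim ** -0.5, i.e. plain scaled dot-product attention with no per-layer factor.
out = flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=True)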
...
@@ -393,7 +386,7 @@ class GPTBigCodeFlashAttention2(GPTBigCodeAttention):
             value = value.to(target_dtype)

         attn_output = self._flash_attention_forward(
-            query, key, value, attention_mask, query_length, dropout=attn_dropout, softmax_scale=softmax_scale
+            query, key, value, attention_mask, query_length, dropout=attn_dropout
         )

         attn_weights_reshaped = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
...
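With the explicit softmax_scale argument dropped in the hunk above, _flash_attention_forward falls back to FlashAttention's default scale. A hypothetical reference helper (not part of the model file) showing what that default computes:

import torch

# Hypothetical reference: softmax_scale=None behaves like standard scaled
# dot-product attention with scale = head_dim ** -0.5 (flash-attn's documented default).
def reference_attention(query, key, value, softmax_scale=None):
    head_dim = query.size(-1)
    if softmax_scale is None:
        softmax_scale = head_dim ** -0.5
    scores = torch.matmul(query, key.transpose(-1, -2)) * softmax_scale
    return torch.matmul(torch.softmax(scores, dim=-1), value)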