flash-attention
"git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "5a8fed914f96c05f299b46142fed5b76ba6e6db4"
Commit b3177dfa, authored Jul 21, 2023 by Tri Dao

[GPT] Enable FlashAttention for GPT-J

Parent: 6fc1e07d
Showing 2 changed files with 5 additions and 2 deletions:

flash_attn/modules/block.py   +3 -0
tests/models/test_gptj.py     +2 -2
flash_attn/modules/block.py
@@ -276,6 +276,9 @@ class ParallelBlock(nn.Module):
             for p in self.norm2.parameters():
                 p._shared_params = True
 
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+
     def forward(self, hidden_states1: Tensor, hidden_states2: Optional[Tensor] = None,
                 residual: Optional[Tensor] = None, mixer_kwargs=None):
         r"""Pass the input through the encoder layer.
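For context, the method added above simply delegates cache allocation to the block's mixer (the attention module), so a generation loop can pre-allocate per-layer key/value buffers once before incremental decoding. The toy sketch below illustrates that delegation pattern only; the ToyMixer/ToyBlock classes and the cache shape are illustrative assumptions, not flash_attn code.

import torch
import torch.nn as nn


class ToyMixer(nn.Module):
    """Stand-in for an attention module that owns its KV-cache layout (assumed)."""

    def __init__(self, num_heads=4, head_dim=64):
        super().__init__()
        self.num_heads, self.head_dim = num_heads, head_dim

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        # One pre-sized buffer holding keys and values for every decoding step.
        return torch.empty(
            batch_size, max_seqlen, 2, self.num_heads, self.head_dim,
            dtype=dtype or torch.float16,
        )


class ToyBlock(nn.Module):
    """Stand-in for ParallelBlock: holds a mixer and forwards cache allocation to it."""

    def __init__(self):
        super().__init__()
        self.mixer = ToyMixer()

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        # Same delegation the commit adds to ParallelBlock.
        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)


blocks = nn.ModuleList([ToyBlock() for _ in range(2)])
kv_caches = {
    i: blk.allocate_inference_cache(batch_size=2, max_seqlen=128)
    for i, blk in enumerate(blocks)
}
print({i: tuple(cache.shape) for i, cache in kv_caches.items()})
# {0: (2, 128, 2, 4, 64), 1: (2, 128, 2, 4, 64)}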
tests/models/test_gptj.py
@@ -36,7 +36,7 @@ def test_gptj_optimized(model_name):
     dtype = torch.float16
     device = 'cuda'
     config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name))
-    config.use_flash_attn = False  # FlashAttention doesn't support hdim 256 yet
+    config.use_flash_attn = True  # FlashAttention-2 supports head dim 256
     config.fused_bias_fc = True
     config.fused_mlp = True
     config.fused_dropout_add_ln = True
@@ -93,7 +93,7 @@ def test_gptj_generation(model_name):
     dtype = torch.float16
     device = 'cuda'
     config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name))
-    config.use_flash_attn = False  # FlashAttention doesn't support hdim 256 yet
+    config.use_flash_attn = True  # FlashAttention-2 supports head dim 256
     config.fused_bias_fc = True
     config.fused_mlp = True
     config.fused_dropout_add_ln = True
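Both hunks flip the same flag: GPT-J uses a head dimension of 256, which FlashAttention-2 now supports, so the optimized attention path can be enabled in these tests. Below is a minimal sketch of the surrounding setup, reconstructed from the context lines above plus assumed imports and an assumed GPTLMHeadModel.from_pretrained call; the model name and the construction call are not part of this diff.

import torch
from transformers import GPTJConfig

from flash_attn.models.gpt import GPTLMHeadModel
from flash_attn.models.gptj import gptj_config_to_gpt2_config

model_name = "EleutherAI/gpt-j-6B"  # assumed; the tests parametrize model_name
dtype = torch.float16
device = "cuda"

config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name))
config.use_flash_attn = True  # FlashAttention-2 supports head dim 256
config.fused_bias_fc = True
config.fused_mlp = True
config.fused_dropout_add_ln = True

# Assumed construction call, mirroring the flash_attn test style.
model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype)
model.eval()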