Unverified commit e85d8639 authored by Fanli Lin, committed by GitHub

add the missing flash attention test marker (#32419)

* add flash attention check

* fix

* fix

* add the missing marker

* bug fix

* add one more

* remove order

* add one more
parent 0aa83282
@@ -628,9 +628,9 @@ class GemmaIntegrationTest(unittest.TestCase):
 
         self.assertEqual(output_text, EXPECTED_TEXTS)
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @require_read_token
-    @pytest.mark.flash_attn_test
     def test_model_2b_flash_attn(self):
         model_id = "google/gemma-2b"
         EXPECTED_TEXTS = [
...
@@ -620,6 +620,7 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
     @require_flash_attn
     @require_torch_gpu
     @slow
+    @pytest.mark.flash_attn_test
     def test_use_flash_attention_2_true(self):
         """
         NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
...
@@ -576,9 +576,10 @@ class MistralIntegrationTest(unittest.TestCase):
         backend_empty_cache(torch_device)
         gc.collect()
 
+    @require_flash_attn
     @require_bitsandbytes
     @slow
-    @require_flash_attn
+    @pytest.mark.flash_attn_test
     def test_model_7b_long_prompt(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
         # An input with 4097 tokens that is above the size of the sliding window
...
@@ -544,6 +544,7 @@ class Qwen2IntegrationTest(unittest.TestCase):
     @require_bitsandbytes
     @slow
     @require_flash_attn
+    @pytest.mark.flash_attn_test
     def test_model_450m_long_prompt(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
         # An input with 4097 tokens that is above the size of the sliding window
...
@@ -606,6 +606,7 @@ class Qwen2MoeIntegrationTest(unittest.TestCase):
     @require_bitsandbytes
     @slow
     @require_flash_attn
+    @pytest.mark.flash_attn_test
     def test_model_a2_7b_long_prompt(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
         # An input with 4097 tokens that is above the size of the sliding window
...
@@ -16,6 +16,7 @@
 import unittest
 
+import pytest
 from parameterized import parameterized
 
 from transformers import StableLmConfig, is_torch_available, set_seed
@@ -539,6 +540,7 @@ class StableLmModelIntegrationTest(unittest.TestCase):
     @require_bitsandbytes
     @slow
     @require_flash_attn
+    @pytest.mark.flash_attn_test
     def test_model_3b_long_prompt(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [3, 3, 3]
         input_ids = [306, 338] * 2047
...
@@ -528,6 +528,7 @@ class Starcoder2IntegrationTest(unittest.TestCase):
         self.assertEqual(EXPECTED_TEXT, output_text)
 
     @require_flash_attn
+    @pytest.mark.flash_attn_test
     def test_starcoder2_batched_generation_fa2(self):
         EXPECTED_TEXT = [
             "Hello my name is Younes and I am a student at the University of Liverpool. I am currently studying for my MSc in Computer Science. I am interested in the field of Machine Learning and I am currently working on",
...
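With the markers in place, the flash-attention tests can be filtered uniformly, e.g. `RUN_SLOW=1 pytest -m flash_attn_test tests/` (assuming the repo's usual `RUN_SLOW` gating for `@slow` tests). For the `-m` expression to work without a `PytestUnknownMarkWarning`, the marker must be registered; a hypothetical `conftest.py` registration — the repository's actual configuration may declare this elsewhere — would look like:

```python
# conftest.py (hypothetical registration; transformers may declare this marker elsewhere)
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "flash_attn_test: mark test as requiring the flash_attn package"
    )
```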