Add retry for flaky tests in CI (#4755)

15ddd843 · fzyzcjy · GitHub · 52029bd1 · 15ddd843 · 15ddd843
Unverified commit 15ddd843, authored Mar 26, 2025 by fzyzcjy; committed by GitHub on Mar 25, 2025.
20 changed files
--- a/test/srt/models/test_embedding_models.py
+++ b/test/srt/models/test_embedding_models.py
@@ -20,7 +20,7 @@ import torch
 from transformers import AutoConfig, AutoTokenizer

 from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
-from sglang.test.test_utils import get_similarities, is_in_ci
+from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci

 MODELS = [
    ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5),
@@ -31,7 +31,7 @@ MODELS = [
 TORCH_DTYPES = [torch.float16]


-class TestEmbeddingModels(unittest.TestCase):
+class TestEmbeddingModels(CustomTestCase):

    @classmethod
    def setUpClass(cls):

--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -33,7 +33,7 @@ from sglang.test.runners import (
    SRTRunner,
    check_close_model_outputs,
 )
-from sglang.test.test_utils import is_in_ci
+from sglang.test.test_utils import CustomTestCase, is_in_ci


 @dataclasses.dataclass
@@ -71,7 +71,7 @@ ALL_OTHER_MODELS = [
 TORCH_DTYPES = [torch.float16]


-class TestGenerationModels(unittest.TestCase):
+class TestGenerationModels(CustomTestCase):

    @classmethod
    def setUpClass(cls):

--- a/test/srt/models/test_gme_qwen_models.py
+++ b/test/srt/models/test_gme_qwen_models.py
@@ -19,7 +19,7 @@ import unittest
 import torch

 from sglang.test.runners import HFRunner, SRTRunner
-from sglang.test.test_utils import get_similarities
+from sglang.test.test_utils import CustomTestCase, get_similarities

 TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
 IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
@@ -31,7 +31,7 @@ MODELS = [
 TORCH_DTYPES = [torch.float16]


-class TestQmeQwenModels(unittest.TestCase):
+class TestQmeQwenModels(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        mp.set_start_method("spawn", force=True)

--- a/test/srt/models/test_grok_models.py
+++ b/test/srt/models/test_grok_models.py
@@ -6,11 +6,12 @@ from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestGrok(unittest.TestCase):
+class TestGrok(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "lmzheng/grok-1"

--- a/test/srt/models/test_qwen_models.py
+++ b/test/srt/models/test_qwen_models.py
@@ -6,11 +6,12 @@ from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestQwen2(unittest.TestCase):
+class TestQwen2(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-7B-Instruct"
@@ -41,7 +42,7 @@ class TestQwen2(unittest.TestCase):
        self.assertGreater(metrics["accuracy"], 0.78)


-class TestQwen2FP8(unittest.TestCase):
+class TestQwen2FP8(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "neuralmagic/Qwen2-7B-Instruct-FP8"

--- a/test/srt/models/test_reward_models.py
+++ b/test/srt/models/test_reward_models.py
@@ -18,6 +18,7 @@ import unittest
 import torch

 from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase

 MODELS = [
    ("LxzGordon/URM-LLaMa-3.1-8B", 1, 4e-2),
@@ -41,7 +42,7 @@ CONVS = [
 ]


-class TestRewardModels(unittest.TestCase):
+class TestRewardModels(CustomTestCase):

    @classmethod
    def setUpClass(cls):

--- a/test/srt/test_abort.py
+++ b/test/srt/test_abort.py
@@ -5,10 +5,10 @@ from concurrent.futures import ThreadPoolExecutor

 import requests

-from sglang.test.test_utils import run_and_check_memory_leak
+from sglang.test.test_utils import CustomTestCase, run_and_check_memory_leak


-class TestAbort(unittest.TestCase):
+class TestAbort(CustomTestCase):
    def workload_func(self, base_url, model):
        def process_func():
            def run_one(_):

--- a/test/srt/test_awq.py
+++ b/test/srt/test_awq.py
@@ -7,11 +7,12 @@ from sglang.test.test_utils import (
    DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestAWQ(unittest.TestCase):
+class TestAWQ(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST

--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -3,6 +3,7 @@ import unittest
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
    get_bool_env_var,
    is_in_ci,
    run_bench_one_batch,
@@ -10,7 +11,7 @@ from sglang.test.test_utils import (
 )


-class TestBenchOneBatch(unittest.TestCase):
+class TestBenchOneBatch(CustomTestCase):
    def test_bs1(self):
        output_throughput = run_bench_one_batch(
            DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]

--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -6,13 +6,14 @@ from sglang.test.test_utils import (
    DEFAULT_FP8_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
    is_in_ci,
    run_bench_serving,
    write_github_step_summary,
 )


-class TestBenchServing(unittest.TestCase):
+class TestBenchServing(CustomTestCase):

    def test_offline_throughput_default(self):
        res = run_bench_serving(

--- a/test/srt/test_block_int8.py
+++ b/test/srt/test_block_int8.py
@@ -5,6 +5,7 @@ import torch

 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.test.test_utils import CustomTestCase


 # For test
@@ -121,7 +122,7 @@ def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
    ).sum(dim=1)


-class TestW8A8BlockINT8FusedMoE(unittest.TestCase):
+class TestW8A8BlockINT8FusedMoE(CustomTestCase):
    DTYPES = [torch.half, torch.bfloat16]
    M = [1, 33, 64, 222]
    N = [128, 1024]

--- a/test/srt/test_cache_report.py
+++ b/test/srt/test_cache_report.py
@@ -8,11 +8,12 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestCacheReport(unittest.TestCase):
+class TestCacheReport(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST

--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -4,10 +4,10 @@ python3 -m unittest test_chunked_prefill.TestChunkedPrefill.test_mixed_chunked_p

 import unittest

-from sglang.test.test_utils import run_mmlu_test, run_mulit_request_test
+from sglang.test.test_utils import CustomTestCase, run_mmlu_test, run_mulit_request_test


-class TestChunkedPrefill(unittest.TestCase):
+class TestChunkedPrefill(CustomTestCase):
    def test_chunked_prefill(self):
        run_mmlu_test(disable_radix_cache=False, enable_mixed_chunk=False)


--- a/test/srt/test_create_kvindices.py
+++ b/test/srt/test_create_kvindices.py
@@ -5,9 +5,10 @@ import numpy as np
 import torch

 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.test.test_utils import CustomTestCase


-class TestCreateKvIndices(unittest.TestCase):
+class TestCreateKvIndices(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        if not torch.cuda.is_available():

--- a/test/srt/test_custom_allreduce.py
+++ b/test/srt/test_custom_allreduce.py
@@ -17,6 +17,7 @@ from sglang.srt.distributed.parallel_state import (
    graph_capture,
    initialize_model_parallel,
 )
+from sglang.test.test_utils import CustomTestCase


 def get_open_port() -> int:
@@ -54,7 +55,7 @@ def multi_process_parallel(
    ray.shutdown()


-class TestCustomAllReduce(unittest.TestCase):
+class TestCustomAllReduce(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        random.seed(42)

--- a/test/srt/test_data_parallelism.py
+++ b/test/srt/test_data_parallelism.py
@@ -10,11 +10,12 @@ from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestDataParallelism(unittest.TestCase):
+class TestDataParallelism(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST

--- a/test/srt/test_double_sparsity.py
+++ b/test/srt/test_double_sparsity.py
@@ -8,11 +8,12 @@ from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestDoubleSparsity(unittest.TestCase):
+class TestDoubleSparsity(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST

--- a/test/srt/test_dp_attention.py
+++ b/test/srt/test_dp_attention.py
@@ -7,11 +7,12 @@ from sglang.test.test_utils import (
    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestDPAttentionDP2TP2(unittest.TestCase):
+class TestDPAttentionDP2TP2(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST

--- a/test/srt/test_eagle_infer.py
+++ b/test/srt/test_eagle_infer.py
@@ -24,6 +24,7 @@ from sglang.test.test_utils import (
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
    run_logprob_check,
 )
@@ -33,7 +34,7 @@ prefill_tolerance = 5e-2
 decode_tolerance: float = 5e-2


-class TestEAGLEEngine(unittest.TestCase):
+class TestEAGLEEngine(CustomTestCase):
    BASE_CONFIG = {
        "model_path": DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
        "speculative_draft_model_path": DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -179,7 +180,7 @@ class TestEAGLE3Engine(TestEAGLEEngine):
    NUM_CONFIGS = 1


-class TestEAGLEServer(unittest.TestCase):
+class TestEAGLEServer(CustomTestCase):
    PROMPTS = [
        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]"
        '[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]',

--- a/test/srt/test_ebnf_constrained.py
+++ b/test/srt/test_ebnf_constrained.py
@@ -15,6 +15,7 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )

@@ -42,7 +43,7 @@ def setup_class(cls, backend: str, disable_overlap: bool):
    )


-class TestEBNFConstrained(unittest.TestCase):
+class TestEBNFConstrained(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        setup_class(cls, "xgrammar", disable_overlap=False)