Add retry for flaky tests in CI (#4755)

15ddd843 · fzyzcjy · GitHub · 52029bd1 · 15ddd843 · 15ddd843
Unverified commit 15ddd843, authored Mar 26, 2025 by fzyzcjy; committed by GitHub on Mar 25, 2025
20 changed files
--- a/test/srt/test_embedding_openai_server.py
+++ b/test/srt/test_embedding_openai_server.py
@@ -7,11 +7,12 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestOpenAIServer(unittest.TestCase):
+class TestOpenAIServer(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "intfloat/e5-mistral-7b-instruct"

--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -12,13 +12,14 @@ from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
 )


-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestEvalAccuracyLarge(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST

--- a/test/srt/test_eval_fp8_accuracy.py
+++ b/test/srt/test_eval_fp8_accuracy.py
@@ -13,11 +13,12 @@ from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestEvalFP8Accuracy(unittest.TestCase):
+class TestEvalFP8Accuracy(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST
@@ -44,7 +45,7 @@ class TestEvalFP8Accuracy(unittest.TestCase):
        self.assertGreaterEqual(metrics["score"], 0.61)


-class TestEvalFP8DynamicQuantAccuracy(unittest.TestCase):
+class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):

    def _run_test(self, model, other_args, expected_score):
        base_url = DEFAULT_URL_FOR_TEST
@@ -109,7 +110,7 @@ class TestEvalFP8DynamicQuantAccuracy(unittest.TestCase):
        )


-class TestEvalFP8ModelOptQuantAccuracy(unittest.TestCase):
+class TestEvalFP8ModelOptQuantAccuracy(CustomTestCase):

    def _run_test(self, model, other_args, expected_score):
        base_url = DEFAULT_URL_FOR_TEST

--- a/test/srt/test_expert_distribution.py
+++ b/test/srt/test_expert_distribution.py
@@ -10,11 +10,12 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestExpertDistribution(unittest.TestCase):
+class TestExpertDistribution(CustomTestCase):
    def setUp(self):
        # Clean up any existing expert distribution files before each test
        for f in glob.glob("expert_distribution_*.csv"):

--- a/test/srt/test_fim_completion.py
+++ b/test/srt/test_fim_completion.py
@@ -7,11 +7,12 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestFimCompletion(unittest.TestCase):
+class TestFimCompletion(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "deepseek-ai/deepseek-coder-1.3b-base"

--- a/test/srt/test_fp8_kernel.py
+++ b/test/srt/test_fp8_kernel.py
@@ -6,9 +6,10 @@ from sglang.srt.layers.quantization.fp8_kernel import (
    per_token_group_quant_fp8,
    w8a8_block_fp8_matmul,
 )
+from sglang.test.test_utils import CustomTestCase


-class TestFP8Base(unittest.TestCase):
+class TestFP8Base(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.M = 256

--- a/test/srt/test_fp8_kvcache.py
+++ b/test/srt/test_fp8_kvcache.py
@@ -9,11 +9,12 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestFp8KvcacheBase(unittest.TestCase):
+class TestFp8KvcacheBase(CustomTestCase):
    model_config = None

    @classmethod

--- a/test/srt/test_function_calling.py
+++ b/test/srt/test_function_calling.py
@@ -10,11 +10,12 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestOpenAIServerFunctionCalling(unittest.TestCase):
+class TestOpenAIServerFunctionCalling(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        # Replace with the model name needed for testing; if not required, reuse DEFAULT_SMALL_MODEL_NAME_FOR_TEST

--- a/test/srt/test_fused_moe.py
+++ b/test/srt/test_fused_moe.py
@@ -7,9 +7,10 @@ from vllm.model_executor.layers.fused_moe import fused_moe as fused_moe_vllm

 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.test.test_utils import CustomTestCase


-class TestFusedMOE(unittest.TestCase):
+class TestFusedMOE(CustomTestCase):
    NUM_EXPERTS = [8, 64]
    TOP_KS = [2, 6]


--- a/test/srt/test_get_weights_by_name.py
+++ b/test/srt/test_get_weights_by_name.py
@@ -12,6 +12,7 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    is_in_ci,
    popen_launch_server,
 )
@@ -26,7 +27,7 @@ def _process_return(ret):
    return np.array(ret)


-class TestGetWeightsByName(unittest.TestCase):
+class TestGetWeightsByName(CustomTestCase):

    def init_hf_model(self, model_name, tie_word_embeddings):
        self.hf_model = AutoModelForCausalLM.from_pretrained(

--- a/test/srt/test_gguf.py
+++ b/test/srt/test_gguf.py
@@ -3,9 +3,10 @@ import unittest
 from huggingface_hub import hf_hub_download

 import sglang as sgl
+from sglang.test.test_utils import CustomTestCase


-class TestGGUF(unittest.TestCase):
+class TestGGUF(CustomTestCase):
    def test_models(self):
        prompt = "Today is a sunny day and I like"
        sampling_params = {"temperature": 0, "max_new_tokens": 8}

--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )

@@ -102,7 +103,7 @@ def check_quant_method(model_path: str, use_marlin_kernel: bool):
 # GPTQ with Dynamic Per/Module Quantization Control
 # Leverages GPTQModel (pypi) to produce the `dynamic` models
 # Test GPTQ fallback kernel that is not Marlin
-class TestGPTQModelDynamic(unittest.TestCase):
+class TestGPTQModelDynamic(CustomTestCase):
    MODEL_PATH = (
        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"
    )
@@ -157,7 +158,7 @@ class TestGPTQModelDynamic(unittest.TestCase):
 # GPTQ with Dynamic Per/Module Quantization Control
 # Leverages GPTQModel (pypi) to produce the `dynamic` models
 # Test Marlin kernel
-class TestGPTQModelDynamicWithMarlin(unittest.TestCase):
+class TestGPTQModelDynamicWithMarlin(CustomTestCase):
    MODEL_PATH = (
        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
    )

--- a/test/srt/test_health_check.py
+++ b/test/srt/test_health_check.py
@@ -3,11 +3,12 @@ import unittest
 from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestHealthCheck(unittest.TestCase):
+class TestHealthCheck(CustomTestCase):
    def test_health_check(self):
        """Test that metrics endpoint returns data when enabled"""
        with self.assertRaises(TimeoutError):

--- a/test/srt/test_hicache.py
+++ b/test/srt/test_hicache.py
@@ -7,11 +7,12 @@ from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestPageSize(unittest.TestCase):
+class TestPageSize(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST

--- a/test/srt/test_hicache_mla.py
+++ b/test/srt/test_hicache_mla.py
@@ -7,11 +7,12 @@ from sglang.test.test_utils import (
    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestHierarchicalMLA(unittest.TestCase):
+class TestHierarchicalMLA(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST

--- a/test/srt/test_hidden_states.py
+++ b/test/srt/test_hidden_states.py
@@ -4,10 +4,10 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import sglang as sgl
-from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase


-class TestHiddenState(unittest.TestCase):
+class TestHiddenState(CustomTestCase):
    def test_return_hidden_states(self):
        prompts = ["Today is", "Today is a sunny day and I like"]
        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST

--- a/test/srt/test_input_embeddings.py
+++ b/test/srt/test_input_embeddings.py
@@ -11,11 +11,12 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestInputEmbeds(unittest.TestCase):
+class TestInputEmbeds(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST

--- a/test/srt/test_int8_kernel.py
+++ b/test/srt/test_int8_kernel.py
@@ -6,6 +6,7 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.test.test_utils import CustomTestCase


 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
@@ -71,7 +72,7 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
    ).sum(dim=1)


-class TestW8A8Int8FusedMoE(unittest.TestCase):
+class TestW8A8Int8FusedMoE(CustomTestCase):
    DTYPES = [torch.half, torch.bfloat16]
    M = [1, 33]
    N = [128, 1024]

--- a/test/srt/test_json_constrained.py
+++ b/test/srt/test_json_constrained.py
@@ -16,6 +16,7 @@ from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
    popen_launch_server,
 )

@@ -50,7 +51,7 @@ def setup_class(cls, backend: str):
    )


-class TestJSONConstrainedOutlinesBackend(unittest.TestCase):
+class TestJSONConstrainedOutlinesBackend(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        setup_class(cls, backend="outlines")

--- a/test/srt/test_large_max_new_tokens.py
+++ b/test/srt/test_large_max_new_tokens.py
@@ -17,11 +17,12 @@ from sglang.test.test_utils import (
    DEFAULT_URL_FOR_TEST,
    STDERR_FILENAME,
    STDOUT_FILENAME,
+    CustomTestCase,
    popen_launch_server,
 )


-class TestLargeMaxNewTokens(unittest.TestCase):
+class TestLargeMaxNewTokens(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST