test_nightly_human_eval.py 3.38 KB
Newer Older
Yineng Zhang's avatar
Yineng Zhang committed
1
2
3
4
5
6
import os
import shutil
import signal
import subprocess
import unittest

7
from test_nightly_gsm8k_eval import parse_models, popen_launch_server_wrapper
Yineng Zhang's avatar
Yineng Zhang committed
8

9
from sglang.srt.utils import kill_process_tree
Yineng Zhang's avatar
Yineng Zhang committed
10
11
12
13
14
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
15
    DEFAULT_MODEL_NAME_FOR_TEST,
Yineng Zhang's avatar
Yineng Zhang committed
16
    DEFAULT_URL_FOR_TEST,
17
    CustomTestCase,
18
    is_in_ci,
Yineng Zhang's avatar
Yineng Zhang committed
19
20
21
)


22
class TestNightlyHumanEval(CustomTestCase):
    """Nightly HumanEval evaluation driven through the ``evalplus`` CLI.

    For every selected model a server is launched with
    ``popen_launch_server_wrapper`` and ``evalplus.evaluate`` is run against
    it as a subprocess.  Handles to both processes are stored on the *class*
    (``process`` / ``eval_process``) so that ``tearDownClass`` can see and
    kill them.
    """

    @classmethod
    def setUpClass(cls):
        """Choose the model groups to evaluate and reset process bookkeeping.

        Each entry of ``model_groups`` is a ``(models, is_fp8, is_tp2)``
        tuple; the two flags are forwarded unchanged to
        ``popen_launch_server_wrapper`` when the server is started.
        """
        if is_in_ci():
            # CI only evaluates the single default test model.
            cls.model_groups = [([DEFAULT_MODEL_NAME_FOR_TEST], False, False)]
        else:
            cls.model_groups = [
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
                (
                    parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1),
                    True,
                    False,
                ),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
            ]

        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None
        cls.eval_process = None

    @classmethod
    def tearDownClass(cls):
        """Kill the server and evaluator process trees, if any are recorded.

        The test also calls this after every model, so each handle is cleared
        before it is killed: a later call (including the one made by the test
        framework at class teardown) then has nothing left to re-kill.
        """
        for attr in ("process", "eval_process"):
            proc = getattr(cls, attr, None)
            setattr(cls, attr, None)
            if proc:
                kill_process_tree(proc.pid)

    def _terminate_eval_process(self):
        """Stop a still-running evaluator process group and reap it.

        Sends SIGTERM to the group first and escalates to SIGKILL if the
        process has not exited ten seconds later.  Does nothing when no
        evaluator is recorded or it has already exited.
        """
        proc = self.eval_process
        if proc is None or proc.poll() is not None:
            return
        for sig in (signal.SIGTERM, signal.SIGKILL):
            try:
                # The evaluator is started in its own session (os.setsid),
                # so signalling the group also reaches its children.
                os.killpg(os.getpgid(proc.pid), sig)
            except ProcessLookupError:
                # The process disappeared between poll() and the signal.
                return
            try:
                # Finish communication so the pipes are drained and the
                # child is reaped instead of being left as a zombie.
                proc.communicate(timeout=10)
                return
            except subprocess.TimeoutExpired:
                continue

    def run_evalplus(self, model):
        """Run ``evalplus.evaluate`` on HumanEval against the running server.

        Best-effort: a non-zero exit status, a timeout (600 seconds) or any
        other error is printed rather than raised, so one failing model does
        not abort the evaluation of the remaining ones.

        Args:
            model: Model name passed to ``evalplus.evaluate --model``.
        """
        print("Delete evalplus results")
        shutil.rmtree("evalplus_results", ignore_errors=True)

        # Target the same address the server was launched on instead of a
        # hard-coded host/port.
        # NOTE(review): assumes base_url is the server root without a "/v1"
        # suffix - confirm against DEFAULT_URL_FOR_TEST.
        api_url = f"{self.base_url.rstrip('/')}/v1"
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            api_url,
            "--greedy",
        ]

        # Store the handle on the class: tearDownClass reads the class
        # attribute, and an instance attribute would be invisible to it.
        cls = type(self)
        # Drop any handle left over from a previous model so that the error
        # path below can never signal an unrelated, already-finished process.
        cls.eval_process = None

        try:
            cls.eval_process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,
            )

            stdout, stderr = cls.eval_process.communicate(timeout=600)

            if cls.eval_process.returncode != 0:
                print(f"Fail to human eval model={model} err={stderr}")

            print("=" * 42)
            print(stdout)
            print("=" * 42)
        except subprocess.TimeoutExpired:
            print(f"Timeout during evaluation for model={model}")
            self._terminate_eval_process()
        except Exception as e:
            print(f"Error running evalplus for model={model} {str(e)}")
            self._terminate_eval_process()

    def test_human_eval_all_models(self):
        """Launch a server for each selected model and evaluate it."""
        cls = type(self)
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama for now
                if "Llama" not in model:
                    continue
                with self.subTest(model=model):
                    try:
                        # Record the server on the class so tearDownClass
                        # can actually find and kill it.
                        cls.process = popen_launch_server_wrapper(
                            self.base_url, model, is_fp8, is_tp2
                        )
                        self.run_evalplus(model)
                    finally:
                        # Always release the server before the next model,
                        # even if the launch or the evaluation raised.
                        self.tearDownClass()


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()