# File: test_eagle_infer.py (6.03 KB)
1
import random
2
import threading
3
import time
4
import unittest
5
from types import SimpleNamespace
6

7
import requests
8

9
import sglang as sgl
10
from sglang.srt.hf_transformers_utils import get_tokenizer
11
from sglang.srt.utils import kill_process_tree
12
from sglang.test.few_shot_gsm8k import run_eval
13
from sglang.test.test_utils import (
14
15
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
16
17
18
19
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)
20
21
22
23
24
25
26
27


class TestEAGLEEngine(unittest.TestCase):
    """Offline-engine tests for EAGLE speculative decoding."""

    def test_eagle_accuracy(self):
        """Exercise an EAGLE engine against a plain engine of the same target model.

        Three cases are run on a single EAGLE engine:

        1. A short prompt at temperature 0 must produce exactly the text that
           the reference (non-speculative) engine produced.
        2. A long generation (up to 1024 new tokens, special tokens kept) must
           not contain the tokenizer's EOS token id once re-encoded.
        3. A batch of prompts is generated and printed; no assertion is made
           on the batched output.

        Both engines are shut down in ``finally`` blocks so a failing
        assertion does not leave an engine running.
        """
        prompt = "Today is a sunny day and I like"
        sampling_params = {"temperature": 0, "max_new_tokens": 8}

        # Get the reference output
        ref_engine = sgl.Engine(model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
        try:
            ref_output = ref_engine.generate(prompt, sampling_params)["text"]
        finally:
            # Release the reference engine before the EAGLE engine is launched.
            ref_engine.shutdown()

        # Launch EAGLE engine
        engine = sgl.Engine(
            model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            speculative_draft_model_path=DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
            speculative_algorithm="EAGLE",
            speculative_num_steps=5,
            speculative_eagle_topk=8,
            speculative_num_draft_tokens=64,
            mem_fraction_static=0.7,
        )
        try:
            # Case 1: Test the output of EAGLE engine is the same as normal engine
            out1 = engine.generate(prompt, sampling_params)["text"]
            print(f"{out1=}, {ref_output=}")
            self.assertEqual(out1, ref_output)

            # Case 2: Test the output of EAGLE engine does not contain unexpected EOS
            prompt = "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like [/INST]"
            sampling_params = {
                "temperature": 0,
                "max_new_tokens": 1024,
                "skip_special_tokens": False,
            }

            tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
            out2 = engine.generate(prompt, sampling_params)["text"]
            print(f"{out2=}")
            tokens = tokenizer.encode(out2, truncation=False)
            # assertNotIn reports the offending values on failure.
            self.assertNotIn(tokenizer.eos_token_id, tokens)

            # Case 3: Batched prompts
            # Named ``batch_prompts`` so the module-level ``prompts`` list is not shadowed.
            batch_prompts = [
                "Hello, my name is",
                "The president of the United States is",
                "The capital of France is",
                "The future of AI is",
            ]
            sampling_params = {"temperature": 0, "max_new_tokens": 30}
            outputs = engine.generate(batch_prompts, sampling_params)
            for prompt, output in zip(batch_prompts, outputs):
                print("===============================")
                print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
        finally:
            # Shutdown the engine
            engine.shutdown()

79

80
81
82
83
84
85
86
87
88
# Prompts sent to the server by TestEAGLEServer's request helpers.
# Every entry must end with a comma: adjacent string literals are implicitly
# concatenated, which would silently merge two prompts into one.
prompts = [
    "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]",
    '[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]',
    "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nSummarize Russell Brunson's Perfect Webinar Script...[/INST]",
    "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nwho are you?[/INST]",
    "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nwhere are you from?[/INST]",
]


89
class TestEAGLEServer(unittest.TestCase):
    """HTTP-server tests for EAGLE speculative decoding.

    One server process is launched for the whole class in ``setUpClass`` and
    killed in ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the server with the EAGLE speculative-decoding flags."""
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                "--speculative-num-steps",
                "5",
                "--speculative-eagle-topk",
                "8",
                "--speculative-num-draft-tokens",
                "64",
                "--mem-fraction-static",
                "0.7",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def send_request(self):
        """POST every module-level prompt to ``/generate`` and require HTTP 200.

        Sleeps a random 0-2 s first so concurrent callers do not all start at
        the same moment.

        Raises:
            AssertionError: if any response status code is not 200.
        """
        time.sleep(random.uniform(0, 2))
        url = self.base_url + "/generate"
        for prompt in prompts:
            data = {
                "text": prompt,
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 1024,
                },
            }
            response = requests.post(url, json=data)
            assert (
                response.status_code == 200
            ), f"unexpected status code {response.status_code}"

    def send_requests_abort(self):
        """POST every prompt with a 1 s client timeout to mimic a disconnect.

        Request failures (the timeout in particular) are expected here; they
        are printed and otherwise ignored.
        """
        url = self.base_url + "/generate"
        for prompt in prompts:
            time.sleep(random.uniform(0, 2))
            data = {
                "model": "base",
                "text": prompt,
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 1024,
                },
            }
            try:
                # set timeout = 1s, mock disconnected
                requests.post(url, json=data, timeout=1)
            except requests.exceptions.RequestException as e:
                print(e)

    def test_request_abort(self):
        """Mix normal requests with aborted ones and require the normal ones to succeed.

        Runs ``concurrency`` threads of ``send_request`` alongside the same
        number of ``send_requests_abort`` threads.
        """
        concurrency = 4
        failures = []

        def checked_send_request():
            # An exception raised on a worker thread never reaches the test
            # result, so record it and assert on the main thread instead.
            try:
                self.send_request()
            except Exception as exc:
                failures.append(exc)

        threads = [
            threading.Thread(target=checked_send_request) for _ in range(concurrency)
        ] + [
            threading.Thread(target=self.send_requests_abort)
            for _ in range(concurrency)
        ]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()

        self.assertEqual(failures, [], "normal requests failed during aborts")

    def test_gsm8k(self):
        """Run the few-shot GSM8K evaluation against the server.

        Requires the reported accuracy to be greater than 0.20.
        """
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is the last ":"-separated component of the base URL.
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(args)
        print(f"{metrics=}")

        self.assertGreater(metrics["accuracy"], 0.20)

178

179
180
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()