"""
python3 -m unittest test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
"""

import os
import time
import unittest
from concurrent.futures import ThreadPoolExecutor

import openai

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestLargeMaxNewTokens(unittest.TestCase):
    """Check that requests with a huge implied max_new_tokens still run concurrently.

    The server is launched with a small KV cache (--max-total-tokens 1024) but a
    large context length (8192). With SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION the
    scheduler's per-request token reservation is clipped, so several long-output
    requests should be admitted at once. The test greps the server's decode log
    for "#running-req: <num_requests>" to confirm concurrency.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        # Capture the server's stdout/stderr to files so the test can grep
        # the decode log lines later.
        cls.stdout = open("stdout.txt", "w")
        cls.stderr = open("stderr.txt", "w")

        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=(
                # NOTE(review): the original passed "--max-total-token" (no "s"),
                # which only worked through argparse prefix matching; the real
                # sglang flag is "--max-total-tokens".
                "--max-total-tokens",
                "1024",
                "--context-len",
                "8192",
                # Log the running-request count frequently so the polling loop
                # below can observe it quickly.
                "--decode-log-interval",
                "2",
            ),
            # Clip the scheduler's max_new_tokens estimate so it does not
            # reserve the full 8192-token context for every request.
            env={"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256", **os.environ},
            return_stdout_stderr=(cls.stdout, cls.stderr),
        )
        cls.base_url += "/v1"
        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid, include_self=True)

        # Close and remove the captured log files.
        cls.stdout.close()
        cls.stderr.close()
        os.remove("stdout.txt")
        os.remove("stderr.txt")

    def run_chat_completion(self):
        """Send one non-streaming chat completion that elicits a very long answer."""
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant"},
                {
                    "role": "user",
                    # Typo fixed: "world" -> "word".
                    "content": "Please repeat the word 'hello' for 10000 times.",
                },
            ],
            temperature=0,
        )
        return response

    def test_chat_completion(self):
        num_requests = 4

        futures = []
        with ThreadPoolExecutor(num_requests) as executor:
            # Send multiple requests
            for _ in range(num_requests):
                futures.append(executor.submit(self.run_chat_completion))

            # Ensure that they are running concurrently by scanning the
            # server's stderr log for "#running-req: <num_requests>".
            all_requests_running = False  # bound before the loop so the assert below is well-defined
            pt = 0  # index of the first log line not yet examined
            # Deadline so a missing log line fails the test instead of
            # hanging the polling loop forever.
            deadline = time.monotonic() + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
            while pt >= 0 and time.monotonic() < deadline:
                time.sleep(5)
                # Context manager closes the handle each iteration; the
                # original re-opened the file without ever closing it.
                with open("stderr.txt") as f:
                    lines = f.readlines()
                for line in lines[pt:]:
                    print(line, end="", flush=True)
                    if f"#running-req: {num_requests}" in line:
                        all_requests_running = True
                        pt = -1  # sentinel: stop the outer polling loop
                        break
                    pt += 1

        assert all_requests_running


# Allow running this test file directly: python3 test_large_max_new_tokens.py
if __name__ == "__main__":
    unittest.main()