"""
python3 -m unittest test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
"""

import os
import time
import unittest
from concurrent.futures import ThreadPoolExecutor

import openai

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    STDERR_FILENAME,
    STDOUT_FILENAME,
    popen_launch_server,
)


class TestLargeMaxNewTokens(unittest.TestCase):
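    """Check that several requests asking for very long generations can run
    concurrently on a server launched with a small total-token budget."""
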
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

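        # Capture the server's stdout/stderr in files so the test can later
        # scan the log for scheduler status lines.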
        cls.stdout = open(STDOUT_FILENAME, "w")
        cls.stderr = open(STDERR_FILENAME, "w")

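        # Launch a server with a small total-token budget and a clipped
        # max_new_tokens estimation (see the env var below), so that several
        # long-generation requests can be scheduled at the same time.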
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=(
                "--max-total-token",
                "1536",
                "--context-len",
                "8192",
                "--decode-log-interval",
                "2",
            ),
            env={"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256", **os.environ},
            return_stdout_stderr=(cls.stdout, cls.stderr),
        )
        cls.base_url += "/v1"
        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid, include_self=True)
        cls.stdout.close()
        cls.stderr.close()
        os.remove(STDOUT_FILENAME)
        os.remove(STDERR_FILENAME)

    def run_chat_completion(self):
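        """Send one chat completion whose prompt asks for a very long answer."""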
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant"},
                {
                    "role": "user",
                    "content": "Please repeat the world 'hello' for 10000 times.",
                },
            ],
            temperature=0,
        )
        return response

    def test_chat_completion(self):
        num_requests = 4

        futures = []
        with ThreadPoolExecutor(num_requests) as executor:
            # Send multiple requests
            for i in range(num_requests):
                futures.append(executor.submit(self.run_chat_completion))

            # Ensure that they are running concurrently
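            # by polling the server's stderr log until a decode log line
            # reports "#running-req: <num_requests>"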
            all_requests_running = False
            pt = 0
            while pt >= 0:
                time.sleep(5)
                with open(STDERR_FILENAME) as f:
                    lines = f.readlines()
                for line in lines[pt:]:
                    print(line, end="", flush=True)
                    if f"#running-req: {num_requests}" in line:
                        all_requests_running = True
                        pt = -1
                        break
                    pt += 1

        assert all_requests_running


if __name__ == "__main__":
    unittest.main()