"""
python3 -m unittest test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
"""

import os
import time
import unittest
from concurrent.futures import ThreadPoolExecutor

import openai

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestLargeMaxNewTokens(unittest.TestCase):
    """Check that several requests with very large max_new_tokens run concurrently.

    The server is launched with a small total-token budget while
    ``SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION`` clips the per-request
    max_new_tokens estimation, so the scheduler should still be able to
    batch all requests at once. Concurrency is verified by scanning the
    server's stderr log for a ``#running-req`` line reporting the full count.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the server and redirect its output to files for polling."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        # Server output goes to files so test_chat_completion can re-read
        # stderr incrementally while the server is still running.
        cls.stdout = open("stdout.txt", "w")
        cls.stderr = open("stderr.txt", "w")

        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            # Use the full flag names; the previous abbreviated forms
            # ("--max-total-token", "--context-len") only worked through
            # argparse prefix matching, which breaks if a new flag ever
            # shares the prefix.
            other_args=("--max-total-tokens", "1024", "--context-length", "8192"),
            # Clip each request's estimated max_new_tokens so that all
            # concurrent requests fit into the 1024-token budget.
            env={"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256", **os.environ},
            return_stdout_stderr=(cls.stdout, cls.stderr),
        )
        cls.base_url += "/v1"
        cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        """Kill the server process and remove the captured output files."""
        kill_child_process(cls.process.pid)
        cls.stdout.close()
        cls.stderr.close()
        os.remove("stdout.txt")
        os.remove("stderr.txt")

    def run_chat_completion(self):
        """Send one chat completion that asks for a very long generation.

        Returns:
            The OpenAI-compatible chat completion response object.
        """
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant"},
                {
                    "role": "user",
                    # Long requested generation forces a large max_new_tokens.
                    "content": "Please repeat the word 'hello' for 10000 times.",
                },
            ],
            temperature=0,
        )
        return response

    def test_chat_completion(self):
        """Fire several long-generation requests and assert they run concurrently."""
        num_requests = 4

        futures = []
        with ThreadPoolExecutor(num_requests) as executor:
            # Send multiple requests.
            for _ in range(num_requests):
                futures.append(executor.submit(self.run_chat_completion))

            # Ensure that they are running concurrently: tail the server's
            # stderr until a scheduler line reports all requests in flight.
            # `pt` counts the lines already inspected so each pass only
            # looks at new output.
            all_requests_running = False  # initialized so the assert below is well-defined
            pt = 0
            while not all_requests_running:
                # Re-open each pass to pick up new output; `with` fixes the
                # file-handle leak of the previous bare open() in a loop.
                with open("stderr.txt") as log:
                    lines = log.readlines()
                for line in lines[pt:]:
                    print(line, end="", flush=True)
                    if f"#running-req: {num_requests}" in line:
                        all_requests_running = True
                        break
                    pt += 1
                # Small pause to avoid a tight busy-wait on the log file.
                time.sleep(0.1)

        assert all_requests_running


# Allow running this file directly (or via `python -m unittest` as shown
# in the module docstring).
if __name__ == "__main__":
    unittest.main()