[ci] set timeout for test_oot_registration.py (#7082)

80694951 · youkaichao · GitHub · c16eaac5 · 80694951 · 80694951
Unverified commit 80694951, authored Aug 02, 2024 by youkaichao; committed by GitHub on Aug 02, 2024.
Showing with 10 additions and 2 deletions

tests/entrypoints/openai/test_oot_registration.py +4 -0

vllm/worker/worker.py +3 -1

vllm/worker/xpu_worker.py +3 -1

No files found.
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -36,10 +36,12 @@ def test_oot_registration_for_api_server():
    ctx = torch.multiprocessing.get_context()
    server = ctx.Process(target=server_function, args=(port, ))
    server.start()
+    MAX_SERVER_START_WAIT_S = 60
    client = OpenAI(
        base_url=f"http://localhost:{port}/v1",
        api_key="token-abc123",
    )
+    now = time.time()
    while True:
        try:
            completion = client.chat.completions.create(
@@ -57,6 +59,8 @@ def test_oot_registration_for_api_server():
        except OpenAIError as e:
            if "Connection error" in str(e):
                time.sleep(3)
+                if time.time() - now > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError("Server did not start in time") from e
            else:
                raise e
    server.kill()

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -186,7 +186,9 @@ class Worker(LocalOrDistributedWorkerBase):
        # GPU did not change their memory usage during the profiling.
        peak_memory = self.init_gpu_memory - free_gpu_memory
        assert peak_memory > 0, (
-            "Error in memory profiling. This happens when the GPU memory was "
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
            "not properly cleaned up before initializing the vLLM instance.")

        cache_block_size = self.get_cache_block_size_bytes()

--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -138,7 +138,9 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
        # GPU did not change their memory usage during the profiling.
        peak_memory = self.init_gpu_memory - free_gpu_memory
        assert peak_memory > 0, (
-            "Error in memory profiling. This happens when the GPU memory was "
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
            "not properly cleaned up before initializing the vLLM instance.")

        cache_block_size = self.get_cache_block_size_bytes()