[CI/Build] Move `test_utils.py` to `tests/utils.py` (#4425)

Since #4335 was merged, I've noticed that the definition of ServerRunner in the tests is the same as the one in the test for the OpenAI API. I have moved the class to the test utilities to avoid code duplication. (Although it has only been repeated twice so far, I will add another similar test suite in #4200, which would duplicate the code a third time.) Also, I have moved the test utilities file (test_utils.py) into the tests directory (tests/utils.py), since none of its code is actually used in the main package. Note that I have added __init__.py to each test subpackage and updated the ray.init() call in the test utilities file so that tests/utils.py can be imported with relative imports.

[CI/Build] Move `test_utils.py` to `tests/utils.py` (#4425)
Since #4335 was merged, I've noticed that the definition of ServerRunner in the tests is the same as the one in the test for the OpenAI API. I have moved the class to the test utilities to avoid code duplication. (Although it has only been repeated twice so far, I will add another similar test suite in #4200, which would duplicate the code a third time.) Also, I have moved the test utilities file (test_utils.py) into the tests directory (tests/utils.py), since none of its code is actually used in the main package. Note that I have added __init__.py to each test subpackage and updated the ray.init() call in the test utilities file so that tests/utils.py can be imported with relative imports.
350f9e10 · Cyrus Leung · GitHub · 702bee46 · 350f9e10 · 350f9e10
Unverified Commit 350f9e10 authored May 13, 2024 by Cyrus Leung Committed by GitHub May 13, 2024
13 changed files
--- a/tests/model_executor/__init__.py
+++ b/tests/model_executor/__init__.py
--- a/tests/models/__init__.py
+++ b/tests/models/__init__.py
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -13,9 +13,10 @@ import os
 import pytest
 import torch

-from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

+from .utils import check_logprobs_close
+
 os.environ["TOKENIZERS_PARALLELISM"] = "true"

 MAX_MODEL_LEN = 1024

--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -15,9 +15,10 @@ from dataclasses import dataclass
 import pytest
 import torch

-from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

+from .utils import check_logprobs_close
+
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
 marlin_not_supported = (capability <

--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest

-from tests.models.utils import check_logprobs_close
+from .utils import check_logprobs_close

 MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.1",

--- a/tests/prefix_caching/__init__.py
+++ b/tests/prefix_caching/__init__.py
--- a/tests/quantization/__init__.py
+++ b/tests/quantization/__init__.py
--- a/tests/samplers/__init__.py
+++ b/tests/samplers/__init__.py
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
 import pytest
 import torch

-from tests.conftest import VllmRunner
 from vllm import SamplingParams

+from ..conftest import VllmRunner
+
 MODELS = ["facebook/opt-125m"]



--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -9,7 +9,6 @@ import torch
 from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                    nvmlInit)

-from tests.conftest import cleanup
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -21,6 +20,8 @@ from vllm.sequence import Logprob, MultiModalData
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, random_uuid

+from ...conftest import cleanup
+

 class AsyncLLM:
    """AsyncLLM

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -9,12 +9,13 @@ import pytest
 import ray
 import torch

-from tests.entrypoints.test_openai_server import ServerRunner
 from vllm import SamplingParams
 from vllm.model_executor.model_loader.tensorizer import (
    EncryptionParams, TensorizerConfig, TensorSerializer,
    is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream)

+from ..utils import ServerRunner
+
 prompts = [
    "Hello, my name is",
    "The president of the United States is",

--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
 import pytest

-from tests.core.utils import create_dummy_prompt
 from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput,
                           SequenceData, SequenceOutput)

+from .core.utils import create_dummy_prompt
+

 @pytest.fixture
 def sample_outputs():

--- a/vllm/test_utils.py
+++ b/vllm/test_utils.py
+import os
+import subprocess
+import sys
+import time
+
 import ray
+import requests

 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
 from vllm.utils import get_open_port

+# Path to root of repository so that utilities can be imported by ray workers
+VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
+
+
+@ray.remote(num_gpus=1)
+class ServerRunner:
+    MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
+
+    def __init__(self, args):
+        env = os.environ.copy()
+        env["PYTHONUNBUFFERED"] = "1"
+        self.proc = subprocess.Popen(
+            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        return True
+
+    def _wait_for_server(self):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    break
+            except Exception as err:
+                if self.proc.poll() is not None:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > self.MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+
+    def __del__(self):
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+

 def init_test_distributed_environment(
    tp_size: int,
@@ -28,7 +77,7 @@ def multi_process_tensor_parallel(
 ) -> None:
    # Using ray helps debugging the error when it failed
    # as compared to multiprocessing.
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})

    distributed_init_port = get_open_port()
    refs = []