[Tests] Disable retries and use context manager for openai client (#7565)

39178c7f · Nick Hill · GitHub · 2eedede8 · 39178c7f · 39178c7f
Unverified commit 39178c7f, authored Aug 26, 2024 by Nick Hill; committed by GitHub on Aug 26, 2024.
15 changed files
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 from ..utils import VLLM_PATH, RemoteOpenAIServer
@@ -31,9 +32,10 @@ def server():
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -2,6 +2,7 @@ from typing import Dict, List
 import openai
 import pytest
+import pytest_asyncio
 from vllm.assets.audio import AudioAsset
 from vllm.multimodal.utils import encode_audio_base64, fetch_audio
@@ -28,9 +29,10 @@ def server():
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.fixture(scope="session")

--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -2,6 +2,7 @@ from http import HTTPStatus
 import openai
 import pytest
+import pytest_asyncio
 import requests
 from vllm.version import __version__ as VLLM_VERSION
@@ -28,9 +29,10 @@ def server():
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -6,6 +6,7 @@ from typing import Dict, List, Optional
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 import torch
 from openai import BadRequestError
@@ -46,9 +47,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -8,6 +8,7 @@ from typing import Dict, List, Optional
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
@@ -89,11 +90,17 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
 @pytest.fixture(scope="module",
                params=["", "--disable-frontend-multiprocessing"])
-def client(default_server_args, request):
+def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
-        yield remote_server.get_async_client()
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -3,6 +3,7 @@ import base64
 import numpy as np
 import openai
 import pytest
+import pytest_asyncio
 from ...utils import RemoteOpenAIServer
@@ -24,10 +25,10 @@ def embedding_server():
        yield remote_server
-@pytest.mark.asyncio
+@pytest_asyncio.fixture
-@pytest.fixture(scope="module")
+async def embedding_client(embedding_server):
-def embedding_client(embedding_server):
+    async with embedding_server.get_async_client() as async_client:
-    return embedding_server.get_async_client()
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
 import openai
 import pytest
+import pytest_asyncio
 from ...utils import RemoteOpenAIServer
@@ -18,9 +19,10 @@ def server():
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -6,6 +6,7 @@ from http import HTTPStatus
 import openai
 import pytest
+import pytest_asyncio
 import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer
@@ -35,11 +36,17 @@ def default_server_args():
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
-def client(default_server_args, request):
+def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
-        yield remote_server.get_async_client()
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as cl:
+        yield cl
 _PROMPT = "Hello my name is Robert and I love magic"

--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
@@ -43,9 +44,10 @@ def server(zephyr_lora_files):
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -25,59 +25,63 @@ def server_with_return_tokens_as_token_ids_flag(
 @pytest.mark.asyncio
 async def test_completion_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
-    client = server_with_return_tokens_as_token_ids_flag.get_async_client()
+    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
+    ) as client:
-    completion = await client.completions.create(
+        completion = await client.completions.create(
-        model=MODEL_NAME,
+            model=MODEL_NAME,
-        # Include Unicode characters to test for dividing a single
+            # Include Unicode characters to test for dividing a single
-        # character across multiple tokens: 🎉 is [28705, 31862] for the
+            # character across multiple tokens: 🎉 is [28705, 31862] for the
-        # Zephyr tokenizer
+            # Zephyr tokenizer
-        prompt="Say 'Hello, world! 🎉'",
+            prompt="Say 'Hello, world! 🎉'",
-        echo=True,
+            echo=True,
-        temperature=0,
+            temperature=0,
-        max_tokens=10,
+            max_tokens=10,
-        logprobs=1)
+            logprobs=1)
-    text = completion.choices[0].text
+        text = completion.choices[0].text
-    token_strs = completion.choices[0].logprobs.tokens
+        token_strs = completion.choices[0].logprobs.tokens
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # Check that the token representations are consistent between raw tokens
+        # Check that the token representations are consistent between raw
-    # and top_logprobs
+        # tokens and top_logprobs
-    # Slice off the first one, because there's no scoring associated with BOS
+        # Slice off the first one, because there's no scoring associated
-    top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
+        # with BOS
-    top_logprob_keys = [
+        top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
-        next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
+        top_logprob_keys = [
-    ]
+            next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
-    assert token_strs[1:] == top_logprob_keys
+        ]
+        assert token_strs[1:] == top_logprob_keys
-    # Check that decoding the tokens gives the expected text
+        # Check that decoding the tokens gives the expected text
-    tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
+        tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
-    assert text == tokenizer.decode(tokens, skip_special_tokens=True)
+        assert text == tokenizer.decode(tokens, skip_special_tokens=True)
 @pytest.mark.asyncio
 async def test_chat_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
-    client = server_with_return_tokens_as_token_ids_flag.get_async_client()
+    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
-    response = await client.chat.completions.create(
+    ) as client:
-        model=MODEL_NAME,
+        response = await client.chat.completions.create(
-        # Include Unicode characters to test for dividing a single
+            model=MODEL_NAME,
-        # character across multiple tokens: 🎉 is [28705, 31862] for the
+            # Include Unicode characters to test for dividing a single
-        # Zephyr tokenizer
+            # character across multiple tokens: 🎉 is [28705, 31862] for the
-        messages=[{
+            # Zephyr tokenizer
-            "role": "system",
+            messages=[{
-            "content": "You like to respond in only emojis, like 🎉"
+                "role": "system",
-        }, {
+                "content": "You like to respond in only emojis, like 🎉"
-            "role": "user",
+            }, {
-            "content": "Please write some emojis: 🐱🐶🎉"
+                "role": "user",
-        }],
+                "content": "Please write some emojis: 🐱🐶🎉"
-        temperature=0,
+            }],
-        max_tokens=8,
+            temperature=0,
-        logprobs=True)
+            max_tokens=8,
+            logprobs=True)
-    text = response.choices[0].message.content
+        text = response.choices[0].message.content
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    token_ids = []
+        token_ids = []
-    for logprob_content in response.choices[0].logprobs.content:
+        for logprob_content in response.choices[0].logprobs.content:
-        token_ids.append(int(logprob_content.token.removeprefix("token_id:")))
+            token_ids.append(
-    assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
+                int(logprob_content.token.removeprefix("token_id:")))
+        assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -35,13 +35,14 @@ async def test_shutdown_on_engine_failure(tmp_path):
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        client = remote_server.get_async_client()
+        async with remote_server.get_async_client() as client:
-        with pytest.raises(openai.APIConnectionError):
+            with pytest.raises(
-            # This crashes the engine
+                (openai.APIConnectionError, openai.InternalServerError)):
-            await client.completions.create(model="bad-adapter",
+                # This crashes the engine
-                                            prompt="Hello, my name is")
+                await client.completions.create(model="bad-adapter",
+                                                prompt="Hello, my name is")
-        # Now the server should shut down
+            # Now the server should shut down
-        return_code = remote_server.proc.wait(timeout=1)
+            return_code = remote_server.proc.wait(timeout=3)
-        assert return_code is not None
+            assert return_code is not None
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 import requests
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -42,9 +43,10 @@ def tokenizer_name(model_name: str,
        model_name == "zephyr-lora2") else model_name
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -2,6 +2,7 @@ from typing import Dict, List
 import openai
 import pytest
+import pytest_asyncio
 from vllm.multimodal.utils import encode_image_base64, fetch_image
@@ -36,9 +37,10 @@ def server():
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.fixture(scope="session")

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -28,12 +28,12 @@ async def completions_with_server_args(prompts: List[str], model_name: str,
    outputs = None
    with RemoteOpenAIServer(model_name, server_cli_args) as server:
-        client = server.get_async_client()
+        async with server.get_async_client() as client:
-        outputs = await client.completions.create(model=model_name,
+            outputs = await client.completions.create(model=model_name,
-                                                  prompt=prompts,
+                                                      prompt=prompts,
-                                                  temperature=0,
+                                                      temperature=0,
-                                                  stream=False,
+                                                      stream=False,
-                                                  max_tokens=5)
+                                                      max_tokens=5)
    assert outputs is not None
    return outputs

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -154,6 +154,7 @@ class RemoteOpenAIServer:
        return openai.AsyncOpenAI(
            base_url=self.url_for("v1"),
            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
        )