Merge branch 'main' of http://10.6.10.68/dcutoolkit/deeplearing/vllm

6640dc0b · zhuwenwen · 44d4d334 · 83e4e0fe · 6640dc0b · 6640dc0b
Commit 6640dc0b authored Jun 20, 2024 by zhuwenwen
20 changed files
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -6,10 +6,11 @@ import torch
 import torch.distributed
 from vllm.distributed.communication_op import (  # noqa
-    graph_capture, tensor_model_parallel_all_reduce)
+    tensor_model_parallel_all_reduce)
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             get_world_group, graph_capture,
                                             init_distributed_environment)
 from vllm.utils import update_environment_variables
@@ -53,7 +54,8 @@ def worker_fn_wrapper(fn):
 @worker_fn_wrapper
 def worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
    tensor = torch.ones(16, 1024, 1024,
                        dtype=torch.float32).cuda(pynccl_comm.rank)
    with pynccl_comm.change_state(enable=True):
@@ -129,7 +131,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
 def worker_fn_with_cudagraph():
    with torch.no_grad():
        graph = torch.cuda.CUDAGraph()
-        pynccl_comm = PyNcclCommunicator()
+        pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                         device=get_world_group().device)
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
        torch.cuda.synchronize()
@@ -154,7 +157,8 @@ def test_pynccl_with_cudagraph():
 @worker_fn_wrapper
 def send_recv_worker_fn():
-    pynccl_comm = PyNcclCommunicator()
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
    if pynccl_comm.rank == 0:
        tensor = torch.ones(16, 1024, 1024,
                            dtype=torch.float32).cuda(pynccl_comm.rank)

--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
+import os
+import ray
+from vllm.utils import cuda_device_count_stateless
+@ray.remote
+class _CUDADeviceCountStatelessTestActor():
+    """Ray actor exposing cuda_device_count_stateless() and the
+    CUDA_VISIBLE_DEVICES variable of the actor's own environment."""
+    def get_count(self):
+        # Return whatever cuda_device_count_stateless() reports right now.
+        return cuda_device_count_stateless()
+    def set_cuda_visible_devices(self, cuda_visible_devices: str):
+        # Overwrite CUDA_VISIBLE_DEVICES in this actor's os.environ.
+        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+    def get_cuda_visible_devices(self):
+        # Raises KeyError if CUDA_VISIBLE_DEVICES is not set.
+        return os.environ["CUDA_VISIBLE_DEVICES"]
+def test_cuda_device_count_stateless():
+    """Test that cuda_device_count_stateless changes return value if
+    CUDA_VISIBLE_DEVICES is changed."""
+    # The actor is requested with two GPUs; the test expects its environment
+    # to start with CUDA_VISIBLE_DEVICES == "0,1".
+    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
+    assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
+    assert ray.get(actor.get_count.remote()) == 2
+    # Shrink the visible set to one device, then to none, re-checking the
+    # reported count after each change.
+    ray.get(actor.set_cuda_visible_devices.remote("0"))
+    assert ray.get(actor.get_count.remote()) == 1
+    ray.get(actor.set_cuda_visible_devices.remote(""))
+    assert ray.get(actor.get_count.remote()) == 0
--- a/tests/entrypoints/test_openai_embedding.py
+++ b/tests/entrypoints/test_openai_embedding.py
+import openai
+import pytest
+import ray
+from ..utils import VLLM_PATH, RemoteOpenAIServer
+EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+pytestmark = pytest.mark.openai
+@pytest.fixture(scope="module")
+def ray_ctx():
+    """Module-scoped fixture: initialize Ray before the tests that request it
+    and shut it down once the module's tests have finished."""
+    # VLLM_PATH is passed as the Ray runtime environment's working directory.
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+@pytest.fixture(scope="module")
+def embedding_server(ray_ctx):
+    """Module-scoped RemoteOpenAIServer configured for the embedding model.
+    Requesting ``ray_ctx`` makes pytest set that fixture up first.
+    """
+    return RemoteOpenAIServer([
+        "--model",
+        EMBEDDING_MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+    ])
+@pytest.fixture(scope="module")
+def embedding_client(embedding_server):
+    """Return the async client obtained from the module's embedding server."""
+    # No ``pytest.mark.asyncio`` here: pytest marks have no effect on fixtures.
+    return embedding_server.get_async_client()
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
+                                model_name: str):
+    """Request one embedding from a text input and one from a token-ID input,
+    checking the embedding length (4096) and the reported token usage."""
+    input_texts = [
+        "The chef prepared a delicious meal.",
+    ]
+    # test single embedding
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 9
+    assert embeddings.usage.total_tokens == 9
+    # test using token IDs
+    # A flat list of five token IDs is sent as one prompt, so usage is 5.
+    input_tokens = [1, 1, 1, 1, 1]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 5
+    assert embeddings.usage.total_tokens == 5
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [EMBEDDING_MODEL_NAME],
+)
+async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
+                               model_name: str):
+    """Request embeddings for a batch of strings and for a batch of token-ID
+    lists, checking that one embedding comes back per input."""
+    # test List[str]
+    input_texts = [
+        "The cat sat on the mat.", "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky."
+    ]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 3
+    assert len(embeddings.data[0].embedding) == 4096
+    # test List[List[int]]
+    # Four prompts of 5, 3, 5 and 4 token IDs: 17 prompt tokens in total.
+    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
+                    [25, 32, 64, 77]]
+    embeddings = await embedding_client.embeddings.create(
+        model=model_name,
+        input=input_tokens,
+        encoding_format="float",
+    )
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 4
+    assert len(embeddings.data[0].embedding) == 4096
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 17
+    assert embeddings.usage.total_tokens == 17
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -15,11 +15,10 @@ from openai import BadRequestError
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from ..utils import ServerRunner
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
@@ -80,9 +79,15 @@ def zephyr_lora_files():
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def ray_ctx():
-    ray.init()
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    server_runner = ServerRunner.remote([
+    yield
+    ray.shutdown()
+@pytest.fixture(scope="module")
+def server(zephyr_lora_files, ray_ctx):
+    return RemoteOpenAIServer([
        "--model",
        MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
@@ -91,8 +96,6 @@ def server(zephyr_lora_files):
        "--max-model-len",
        "8192",
        "--enforce-eager",
-        "--gpu-memory-utilization",
-        "0.75",
        # lora config below
        "--enable-lora",
        "--lora-modules",
@@ -105,43 +108,14 @@ def server(zephyr_lora_files):
        "--max-num-seqs",
        "128",
    ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
 @pytest.fixture(scope="module")
-def embedding_server(zephyr_lora_files):
+def client(server):
-    ray.shutdown()
+    return server.get_async_client()
-    ray.init()
-    server_runner = ServerRunner.remote([
-        "--model",
-        EMBEDDING_MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--enforce-eager",
-        "--gpu-memory-utilization",
-        "0.75",
-        "--max-model-len",
-        "8192",
-    ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
-@pytest.fixture(scope="module")
-def client():
-    client = openai.AsyncOpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
-@pytest.mark.asyncio
+async def test_check_models(client: openai.AsyncOpenAI):
-async def test_check_models(server, client: openai.AsyncOpenAI):
    models = await client.models.list()
    models = models.data
    served_model = models[0]
@@ -158,8 +132,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_single_completion(server, client: openai.AsyncOpenAI,
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
-                                 model_name: str):
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
@@ -190,8 +163,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_no_logprobs(server, client: openai.AsyncOpenAI,
+async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
-                           model_name: str):
    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
@@ -210,8 +182,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
+async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
-                             model_name: str):
    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
@@ -232,8 +203,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_some_logprobs(server, client: openai.AsyncOpenAI,
+async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
-                             model_name: str):
    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
@@ -254,7 +224,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
+async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    with pytest.raises(
@@ -300,8 +270,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
+async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-                                model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -326,8 +295,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
+async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-                                  model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -354,8 +322,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
+async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
-                                  model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -382,7 +349,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
+async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
                                      model_name: str):
    messages = [{
        "role": "system",
@@ -425,7 +392,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_single_chat_session(server, client: openai.AsyncOpenAI,
+async def test_single_chat_session(client: openai.AsyncOpenAI,
                                   model_name: str):
    messages = [{
        "role": "system",
@@ -470,7 +437,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_completion_streaming(server, client: openai.AsyncOpenAI,
+async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    prompt = "What is an LLM?"
@@ -505,8 +472,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_chat_streaming(server, client: openai.AsyncOpenAI,
+async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
-                              model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -555,8 +521,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
 )
-async def test_chat_completion_stream_options(server,
+async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
-                                              client: openai.AsyncOpenAI,
                                              model_name: str):
    messages = [{
        "role": "system",
@@ -626,7 +591,7 @@ async def test_chat_completion_stream_options(server,
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
 )
-async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
+async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    prompt = "What is the capital of France?"
@@ -688,8 +653,7 @@ async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
-async def test_batch_completions(server, client: openai.AsyncOpenAI,
+async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
-                                 model_name: str):
    # test simple list
    batch = await client.completions.create(
        model=model_name,
@@ -737,7 +701,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-async def test_logits_bias(server, client: openai.AsyncOpenAI):
+async def test_logits_bias(client: openai.AsyncOpenAI):
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
@@ -786,7 +750,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str):
    completion = await client.completions.create(
        model=MODEL_NAME,
@@ -808,7 +772,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                guided_decoding_backend: str):
    messages = [{
        "role": "system",
@@ -855,7 +819,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str):
    completion = await client.completions.create(
        model=MODEL_NAME,
@@ -875,7 +839,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_regex_chat(client: openai.AsyncOpenAI,
                                 guided_decoding_backend: str):
    messages = [{
        "role": "system",
@@ -913,7 +877,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str):
    completion = await client.completions.create(
        model=MODEL_NAME,
@@ -933,7 +897,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_chat(client: openai.AsyncOpenAI,
                                  guided_decoding_backend: str):
    messages = [{
        "role": "system",
@@ -972,7 +936,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str):
    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
@@ -1008,7 +972,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
                                           guided_decoding_backend: str):
    messages = [{
        "role": "system",
@@ -1040,7 +1004,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
-async def test_named_tool_use(server, client: openai.AsyncOpenAI,
+async def test_named_tool_use(client: openai.AsyncOpenAI,
                              guided_decoding_backend: str):
    messages = [{
        "role": "system",
@@ -1131,7 +1095,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_required_tool_use_not_yet_supported(
-        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+        client: openai.AsyncOpenAI, guided_decoding_backend: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -1177,7 +1141,7 @@ async def test_required_tool_use_not_yet_supported(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_inconsistent_tool_choice_and_tools(
-        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+        client: openai.AsyncOpenAI, guided_decoding_backend: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -1223,7 +1187,7 @@ async def test_inconsistent_tool_choice_and_tools(
 @pytest.mark.asyncio
-async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
+async def test_response_format_json_object(client: openai.AsyncOpenAI):
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
@@ -1243,7 +1207,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_extra_fields(server, client: openai.AsyncOpenAI):
+async def test_extra_fields(client: openai.AsyncOpenAI):
    with pytest.raises(BadRequestError) as exc_info:
        await client.chat.completions.create(
            model=MODEL_NAME,
@@ -1259,7 +1223,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_complex_message_content(server, client: openai.AsyncOpenAI):
+async def test_complex_message_content(client: openai.AsyncOpenAI):
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{
@@ -1279,7 +1243,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_custom_role(server, client: openai.AsyncOpenAI):
+async def test_custom_role(client: openai.AsyncOpenAI):
    # Not sure how the model handles custom roles so we just check that
    # both string and complex message content are handled in the same way
@@ -1310,7 +1274,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
-async def test_guided_grammar(server, client: openai.AsyncOpenAI):
+async def test_guided_grammar(client: openai.AsyncOpenAI):
    simple_sql_grammar = """
 start: select_statement
@@ -1351,7 +1315,7 @@ number: "1" | "2"
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
+async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
@@ -1380,7 +1344,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
-async def test_long_seed(server, client: openai.AsyncOpenAI):
+async def test_long_seed(client: openai.AsyncOpenAI):
    for seed in [
            torch.iinfo(torch.long).min - 1,
            torch.iinfo(torch.long).max + 1
@@ -1399,81 +1363,5 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
                or "less_than_equal" in exc_info.value.message)
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
-                                model_name: str):
-    input_texts = [
-        "The chef prepared a delicious meal.",
-    ]
-    # test single embedding
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 9
-    assert embeddings.usage.total_tokens == 9
-    # test using token IDs
-    input_tokens = [1, 1, 1, 1, 1]
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 5
-    assert embeddings.usage.total_tokens == 5
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [EMBEDDING_MODEL_NAME],
-)
-async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
-                               model_name: str):
-    # test List[str]
-    input_texts = [
-        "The cat sat on the mat.", "A feline was resting on a rug.",
-        "Stars twinkle brightly in the night sky."
-    ]
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 3
-    assert len(embeddings.data[0].embedding) == 4096
-    # test List[List[int]]
-    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-                    [25, 32, 64, 77]]
-    embeddings = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        encoding_format="float",
-    )
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 4
-    assert len(embeddings.data[0].embedding) == 4096
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 17
-    assert embeddings.usage.total_tokens == 17
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/test_openai_vision.py
@@ -8,7 +8,7 @@ import ray
 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
-from ..utils import ServerRunner
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
 LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
@@ -25,10 +25,16 @@ TEST_IMAGE_URLS = [
 pytestmark = pytest.mark.openai
+@pytest.fixture(scope="module")
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
 @pytest.fixture(scope="module")
 def server():
-    ray.init()
+    return RemoteOpenAIServer([
-    server_runner = ServerRunner.remote([
        "--model",
        MODEL_NAME,
        "--dtype",
@@ -47,18 +53,11 @@ def server():
        "--chat-template",
        str(LLAVA_CHAT_TEMPLATE),
    ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
-def client():
+def client(server):
-    client = openai.AsyncOpenAI(
+    return server.get_async_client()
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
 @pytest_asyncio.fixture(scope="session")
@@ -73,7 +72,7 @@ async def base64_encoded_image() -> Dict[str, str]:
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
+async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    messages = [{
        "role":
@@ -126,7 +125,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_single_chat_session_image_base64encoded(
-        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
+        client: openai.AsyncOpenAI, model_name: str, image_url: str,
        base64_encoded_image: Dict[str, str]):
    messages = [{
@@ -180,7 +179,7 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
+async def test_chat_streaming_image(client: openai.AsyncOpenAI,
                                    model_name: str, image_url: str):
    messages = [{
        "role":
@@ -237,8 +236,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
-async def test_multi_image_input(server, client: openai.AsyncOpenAI,
+async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 model_name: str, image_url: str):
+                                 image_url: str):
    messages = [{
        "role":

--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -47,7 +47,7 @@ def cutlass_fp8_gemm_helper(m: int,
    scale_b = (torch.randn(
        (1, n_b_scales), device=device, dtype=torch.float32) / 10)
-    out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype)
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                        scale_b * b.to(dtype=torch.float32)).to(out_dtype)
@@ -74,7 +74,7 @@ def cutlass_int8_gemm_helper(m: int,
    scale_b = (torch.randn(
        (1, n_b_scales), device=device, dtype=torch.float32) / 10)
-    out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype)
+    out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype)
    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                        scale_b *
                        b.to(dtype=torch.float32)).to(dtype=out_dtype)
@@ -180,11 +180,11 @@ def test_cutlass_subset():
    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
-    out = ops.cutlass_scaled_mm_dq(a,
+    out = ops.cutlass_scaled_mm(a,
-                                   b,
+                                b,
-                                   scale_a,
+                                scale_a,
-                                   scale_b,
+                                scale_b,
-                                   out_dtype=torch.bfloat16)
+                                out_dtype=torch.bfloat16)
    baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                        scale_b *
                        b.to(dtype=torch.float32)).to(dtype=torch.bfloat16)
@@ -203,8 +203,8 @@ class CutlassLayer(torch.nn.Module):
        self.out_dtype = out_dtype
    def forward(self, a):
-        return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b,
+        return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
-                                        self.out_dtype)
+                                     self.out_dtype)
 @pytest.mark.parametrize("per_act_token", [True, False])

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -12,7 +12,10 @@ from huggingface_hub import snapshot_download
 import vllm
 from vllm.config import LoRAConfig
-from vllm.distributed import destroy_model_parallel, initialize_model_parallel
+from vllm.distributed import (destroy_distributed_environment,
+                              destroy_model_parallel,
+                              init_distributed_environment,
+                              initialize_model_parallel)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear)
@@ -35,6 +38,7 @@ LONG_LORA_INFOS = [{
 def cleanup():
    destroy_model_parallel()
+    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
@@ -64,15 +68,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 @pytest.fixture
 def dist_init():
-    if not torch.distributed.is_initialized():
+    temp_file = tempfile.mkstemp()[1]
-        temp_file = tempfile.mkstemp()[1]
+    init_distributed_environment(
-        torch.distributed.init_process_group(
+        world_size=1,
-            backend="nccl",
+        rank=0,
-            world_size=1,
+        distributed_init_method=f"file://{temp_file}",
-            rank=0,
+        local_rank=0,
-            init_method=f"file://{temp_file}",
+        backend="nccl",
-        )
+    )
-        torch.distributed.all_reduce(torch.zeros(1).cuda())
    initialize_model_parallel(1, 1)
    yield
    cleanup()

--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
 """
 import pytest
-import torch
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
-aqlm_not_supported = True
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    aqlm_not_supported = (capability <
-                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
@@ -67,7 +58,7 @@ ground_truth_generations = [
 ]
-@pytest.mark.skipif(aqlm_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                    reason="AQLM is not supported on this GPU type.")
 @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -8,8 +8,8 @@ import pytest
 import torch
 from transformers import AutoTokenizer
+from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -67,16 +67,16 @@ EXPECTED_STRS_MAP = {
    },
 }
-fp8_not_supported = True
-if torch.cuda.is_available():
+# This test compares against golden strings for exact match since
-    capability = torch.cuda.get_device_capability()
+# there is no baseline implementation to compare against
-    capability = capability[0] * 10 + capability[1]
+# and is unstable w.r.t specifics of the fp8 implementation or
-    fp8_not_supported = (capability <
+# the hardware being run on.
-                         QUANTIZATION_METHODS["fp8"].get_min_capability())
+# Disabled to prevent it from breaking the build
+@pytest.mark.skip(
+    reason=
-@pytest.mark.skipif(fp8_not_supported,
+    "Prevent unstable test based on golden strings from breaking the build.")
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])

--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
 import os
 import pytest
-import torch
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
 from .utils import check_logprobs_close
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
-gptq_marlin_not_supported = True
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    gptq_marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
 MODELS = [
    # act_order==False, group_size=channelwise
    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
@@ -53,7 +44,7 @@ MODELS = [
 @pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(gptq_marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "bfloat16"])

--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
 from dataclasses import dataclass
 import pytest
-import torch
 from tests.models.utils import check_logprobs_close
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
-marlin_not_supported = True
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass
@@ -47,7 +38,7 @@ model_pairs = [
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                    reason="Marlin24 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
 from dataclasses import dataclass
 import pytest
-import torch
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
 from .utils import check_logprobs_close
-marlin_not_supported = True
-if torch.cuda.is_available():
-    capability = torch.cuda.get_device_capability()
-    capability = capability[0] * 10 + capability[1]
-    marlin_not_supported = (
-        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass
 class ModelPair:
@@ -45,7 +36,7 @@ model_pairs = [
 @pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(marlin_not_supported,
+@pytest.mark.skipif(not is_quant_method_supported("marlin"),
                    reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])

--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
 import pytest
 import torch
+from tests.quantization.utils import is_quant_method_supported
 from vllm import SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-@pytest.mark.skipif(
+                    reason='bitsandbytes is not supported on this GPU type.')
-    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
-    reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
    with vllm_runner('huggyllama/llama-7b',
                     quantization='bitsandbytes',

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -3,12 +3,13 @@
 Run `pytest tests/quantization/test_compressed_tensors.py`.
 """
+import pytest
 import torch
 from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsLinearMethod, CompressedTensorsW4A16,
-    CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsW8A8DynamicToken, CompressedTensorsW8A8StaticTensor)
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
@@ -60,3 +61,25 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
        assert qkv_proj.weight.dtype is torch.int8
+@pytest.mark.parametrize("w4a16_args", [
+    ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None),
+    ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
+])
+def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
+    model, strategy, group = w4a16_args
+    with vllm_runner(model) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+        qkv_proj = layer.self_attn.qkv_proj
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16)
+        assert qkv_proj.scheme.strategy == strategy
+        assert qkv_proj.scheme.group_size == group
+        assert qkv_proj.weight_packed.dtype is torch.int32
+        assert qkv_proj.weight_scale.dtype is torch.float16
+        assert qkv_proj.weight_packed.pack_factor == 8
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -5,16 +5,13 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
 import pytest
 import torch
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from tests.quantization.utils import is_quant_method_supported
+from vllm._custom_ops import scaled_fp8_quant
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-@pytest.mark.skipif(
+                    reason="FP8 is not supported on this GPU type.")
-    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
-    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
@@ -22,3 +19,48 @@ def test_load_fp16_model(vllm_runner) -> None:
        fc1 = model.model.decoder.layers[0].fc1
        assert isinstance(fc1.quant_method, Fp8LinearMethod)
        assert fc1.weight.dtype == torch.float8_e4m3fn
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="FP8 is not supported on this GPU type.")
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_scaled_fp8_quant(dtype) -> None:
+    def quantize_ref(tensor, inv_scale):
+        # The reference implementation that fully aligns to
+        # the kernel being tested.
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        scale = inv_scale.reciprocal()
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
+                                                           max=finfo.max)
+        qweight = qweight.to(torch.float8_e4m3fn)
+        return qweight
+    def per_tensor_dequantize(tensor, inv_scale, dtype):
+        fake_qweight = tensor.to(dtype)
+        dq_weight = fake_qweight * inv_scale
+        return dq_weight
+    # Note that we use a shape % 4 != 0 to cover edge cases,
+    # because scaled_fp8_quant is vectorized by 4.
+    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
+    # Dynamic quantization
+    ref_y, inv_scale = scaled_fp8_quant(x, None)
+    ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
+    # Reference dynamic quantization
+    y = quantize_ref(x, inv_scale)
+    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+    # Static quantization
+    y, _ = scaled_fp8_quant(x, inv_scale)
+    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+    # Padding
+    y, _ = scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    assert y.shape[0] == 17
+    assert torch.allclose(
+        ref_y,
+        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
+                              dtype))
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
+import torch
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+def is_quant_method_supported(quant_method: str) -> bool:
+    """Return True if the current GPU meets ``quant_method``'s minimum
+    compute capability, and False otherwise (including when CUDA is not
+    available). ``quant_method`` is a key of ``QUANTIZATION_METHODS``.
+    """
+    # Currently, all quantization methods require Nvidia or AMD GPUs
+    if not torch.cuda.is_available():
+        return False
+    # Fold (major, minor) into one integer, e.g. (8, 0) -> 80, so it can be
+    # compared against the value returned by get_min_capability().
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    # Supported means the device is at or above the minimum, not below it.
+    return (capability >=
+            QUANTIZATION_METHODS[quant_method].get_min_capability())
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
 import json
 import os
+import pathlib
 import subprocess
 from unittest.mock import MagicMock, patch
 import openai
 import pytest
 import ray
+import torch
+from tensorizer import EncryptionParams
 from vllm import SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                         TensorSerializer,
                                                         is_vllm_tensorized,
                                                         load_with_tensorizer,
                                                         open_stream,
-                                                         serialize_vllm_model)
+                                                         serialize_vllm_model,
+                                                         tensorize_vllm_model)
-from ..utils import ServerRunner
+from ..conftest import VllmRunner, cleanup
+from ..utils import RemoteOpenAIServer
 # yapf conflicts with isort for this docstring
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
@@ -42,6 +49,20 @@ def is_curl_installed():
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
+def get_torch_model(vllm_runner: VllmRunner):
+    return vllm_runner \
+            .model \
+            .llm_engine \
+            .model_executor \
+            .driver_worker \
+            .model_runner \
+            .model
+def write_keyfile(keyfile_path: str):
+    encryption_params = EncryptionParams.random()
+    pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(keyfile_path, 'wb') as f:
+        f.write(encryption_params.key)
 @pytest.fixture(autouse=True)
 def tensorizer_config():
@@ -88,12 +109,17 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        key_path = tmp_path / (model_ref + ".key")
+        write_keyfile(key_path)
        outputs = vllm_model.generate(prompts, sampling_params)
-        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
+        config_for_serializing = TensorizerConfig(
-        serialize_vllm_model(vllm_model.model.llm_engine,
+            tensorizer_uri=model_path,
-                            config_for_serializing,
+            encryption_keyfile=key_path
-                            encryption_key_path=key_path)
+        )
+        serialize_vllm_model(get_torch_model(vllm_model),
+                            config_for_serializing)
    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)
@@ -145,7 +171,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
-        serialize_vllm_model(vllm_model.model.llm_engine,
+        serialize_vllm_model(get_torch_model(vllm_model),
                            TensorizerConfig(tensorizer_uri=model_path))
    with vllm_runner(
@@ -180,7 +206,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
-        serialize_vllm_model(vllm_model.model.llm_engine,
+        serialize_vllm_model(get_torch_model(vllm_model),
                            TensorizerConfig(tensorizer_uri=model_path))
        model_loader_extra_config = {
@@ -191,18 +217,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    openai_args = [
        "--model", model_ref, "--dtype", "float16", "--load-format",
        "tensorizer", "--model-loader-extra-config",
-        json.dumps(model_loader_extra_config), "--port", "8000"
+        json.dumps(model_loader_extra_config),
    ]
-    server = ServerRunner.remote(openai_args)
+    server = RemoteOpenAIServer(openai_args)
-    assert ray.get(server.ready.remote())
    print("Server ready.")
-    client = openai.OpenAI(
+    client = server.get_client()
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
    completion = client.completions.create(model=model_ref,
                                           prompt="Hello, my name is",
                                           max_tokens=5,
@@ -224,7 +245,9 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
-def test_tensorizer_with_tp(vllm_runner):
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Requires 2 GPUs")
+def test_tensorizer_with_tp_path_without_template(vllm_runner):
    with pytest.raises(ValueError):
        model_ref = "EleutherAI/pythia-1.4b"
        tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
@@ -238,8 +261,62 @@ def test_tensorizer_with_tp(vllm_runner):
                s3_endpoint="object.ord1.coreweave.com",
            ),
            tensor_parallel_size=2,
+            disable_custom_all_reduce=True,
        )
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Requires 2 GPUs")
+def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
+                                                                    tmp_path):
+    model_ref = "EleutherAI/pythia-1.4b"
+    # record outputs from un-sharded un-tensorized model
+    base_model = vllm_runner(
+        model_ref,
+        disable_custom_all_reduce=True,
+        enforce_eager=True,
+    )
+    outputs = base_model.generate(prompts, sampling_params)
+    base_model.model.llm_engine.model_executor.shutdown()
+    del base_model
+    cleanup()
+    ray.shutdown()
+    # load model with two shards and serialize with encryption
+    model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
+    key_path = tmp_path / (model_ref + ".key")
+    tensorizer_config = TensorizerConfig(
+        tensorizer_uri=model_path,
+        encryption_keyfile=key_path,
+    )
+    tensorize_vllm_model(
+        engine_args=EngineArgs(
+                model=model_ref,
+                tensor_parallel_size=2,
+                disable_custom_all_reduce=True,
+                enforce_eager=True,
+            ),
+        tensorizer_config=tensorizer_config,
+    )
+    assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
+    assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
+    cleanup()
+    ray.shutdown()
+    loaded_vllm_model = vllm_runner(
+        model_ref,
+        tensor_parallel_size=2,
+        load_format="tensorizer",
+        disable_custom_all_reduce=True,
+        enforce_eager=True,
+        model_loader_extra_config=tensorizer_config)
+    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+    assert outputs == deserialized_outputs
 def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    model_ref = "facebook/opt-125m"
@@ -248,7 +325,7 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    with vllm_runner(model_ref) as vllm_model:
        outputs = vllm_model.generate(prompts, sampling_params)
-        serialize_vllm_model(vllm_model.model.llm_engine, config)
+        serialize_vllm_model(get_torch_model(vllm_model), config)
        assert is_vllm_tensorized(config)

--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -39,7 +39,7 @@ def test_filter_subtensors():
    filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
    assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
    for key, tensor in filtered_state_dict.items():
-        # NOTE: don't use `euqal` here, as the tensor might contain NaNs
+        # NOTE: don't use `equal` here, as the tensor might contain NaNs
        assert tensor is state_dict[key]

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,57 +4,109 @@ import sys
 import time
 import warnings
 from contextlib import contextmanager
+from typing import List
+import openai
 import ray
 import requests
 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
+from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.utils import get_open_port
 # Path to root of repository so that utilities can be imported by ray workers
 VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
-@ray.remote(num_gpus=1)
+class RemoteOpenAIServer:
-class ServerRunner:
+    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
    MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 600 seconds
-    def __init__(self, args):
+    @ray.remote(num_gpus=1)
-        env = os.environ.copy()
+    class _RemoteRunner:
-        env["PYTHONUNBUFFERED"] = "1"
-        self.proc = subprocess.Popen(
+        def __init__(self, cli_args: List[str], *, wait_url: str,
-            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
+                     wait_timeout: float) -> None:
-            args,
+            env = os.environ.copy()
-            env=env,
+            env["PYTHONUNBUFFERED"] = "1"
-            stdout=sys.stdout,
+            self.proc = subprocess.Popen(
-            stderr=sys.stderr,
+                [
+                    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+                    *cli_args
+                ],
+                env=env,
+                stdout=sys.stdout,
+                stderr=sys.stderr,
+            )
+            self._wait_for_server(url=wait_url, timeout=wait_timeout)
+        def ready(self):
+            return True
+        def _wait_for_server(self, *, url: str, timeout: float):
+            """Poll ``url`` until it answers with HTTP status 200.
+            Raises RuntimeError if the server subprocess has exited, or if
+            more than ``timeout`` seconds pass without a 200 response.
+            """
+            # run health check
+            start = time.time()
+            while True:
+                err = None
+                try:
+                    if requests.get(url).status_code == 200:
+                        break
+                except Exception as exc:
+                    # Keep the failure for exception chaining below; the name
+                    # bound by `except` is cleared when this block ends.
+                    err = exc
+                if self.proc.poll() is not None:
+                    raise RuntimeError(
+                        "Server exited unexpectedly.") from err
+                # Back off and enforce the deadline after every failed probe,
+                # including non-200 responses, so the loop cannot spin forever.
+                time.sleep(0.5)
+                if time.time() - start > timeout:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+        def __del__(self):
+            if hasattr(self, "proc"):
+                self.proc.terminate()
+    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+        """Launch the OpenAI-compatible API server in a remote runner and
+        block until its health endpoint is ready.
+        ``cli_args`` are passed to ``vllm.entrypoints.openai.api_server``.
+        When ``auto_port`` is True a free port is appended to them, and a
+        ValueError is raised if they already contain a port option.
+        """
+        if auto_port:
+            # Also catch the `--port=N` spelling, which argparse accepts.
+            port_given = any(
+                arg in ("-p", "--port") or arg.startswith("--port=")
+                for arg in cli_args)
+            if port_given:
+                raise ValueError("You have manually specified the port "
+                                 "when `auto_port=True`.")
+            # Build a new list so the caller's argument list is not mutated.
+            cli_args = cli_args + ["--port", str(get_open_port())]
+        # Parse with the server's own parser to get the host and port that
+        # the server will actually bind to.
+        parser = make_arg_parser()
+        args = parser.parse_args(cli_args)
+        self.host = str(args.host or 'localhost')
+        self.port = int(args.port)
+        self._runner = self._RemoteRunner.remote(
+            cli_args,
+            wait_url=self.url_for("health"),
+            wait_timeout=self.MAX_SERVER_START_WAIT_S)
+        self._wait_until_ready()
+    @property
+    def url_root(self) -> str:
+        return f"http://{self.host}:{self.port}"
+    def url_for(self, *parts: str) -> str:
+        return self.url_root + "/" + "/".join(parts)
+    def _wait_until_ready(self) -> None:
+        ray.get(self._runner.ready.remote())
+    def get_client(self):
+        return openai.OpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
+        )
+    def get_async_client(self):
+        return openai.AsyncOpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
        )
-        self._wait_for_server()
-    def ready(self):
-        return True
-    def _wait_for_server(self):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(
-                        "http://localhost:8000/health").status_code == 200:
-                    break
-            except Exception as err:
-                if self.proc.poll() is not None:
-                    raise RuntimeError("Server exited unexpectedly.") from err
-                time.sleep(0.5)
-                if time.time() - start > self.MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from err
-    def __del__(self):
-        if hasattr(self, "proc"):
-            self.proc.terminate()
 def init_test_distributed_environment(

--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
 import pytest
 import torch
-from vllm.distributed.parallel_state import init_distributed_environment
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             init_distributed_environment)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
@@ -292,6 +293,7 @@ def distributed_init():
        rank=0,
        distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}",
        local_rank=0)
+    ensure_model_parallel_initialized(1, 1)
 @pytest.mark.parametrize("batch_size", list(range(2, 128)))