merge v0.5.0

f48954a4 · zhuwenwen · 1dba29d3 · 8f89d720 · f48954a4 · f48954a4
Commit f48954a4 authored Jun 12, 2024 by zhuwenwen
20 changed files
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
-from ..utils import create_seq_group, create_seq_group_encoder_decoder
+from ..utils import (create_dummy_prompt, create_seq_group,
+                     create_seq_group_encoder_decoder)
 @pytest.mark.parametrize("block_size", [16])
@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
    assert num_consumed_blocks == expected_consumed_blocks
+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("num_cpu_blocks", [4])
+@pytest.mark.parametrize("num_gpu_blocks", [4])
+@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
+@pytest.mark.parametrize("enable_caching", [False, True])
+def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
+              enable_caching):
+    """Verify blocks number on src/dest device is correct after swapping in/out
+        sequence group (not missing or extra blocks).
+    """
+    block_manager = BlockSpaceManagerV2(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0,
+                                        enable_caching=enable_caching)
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+    # Swap seq group from GPU -> CPU.
+    gpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_out(seq_group)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_out(seq_group)
+    mapping_keys = [key for key, _ in mapping]
+    assert mapping_keys == gpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
+    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
+    prompt.status = SequenceStatus.SWAPPED
+    # Swap seq group from CPU -> GPU.
+    assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_in(seq_group)
+    cpu_blocks = block_manager.get_block_table(prompt)
+    mapping_keys = [key for key, _ in mapping]
+    assert mapping_keys == [cpu_blocks[0]]
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
+# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
 @pytest.mark.parametrize("block_size", [8, 16])
 @pytest.mark.parametrize("prompt_len", [10, 300, 1000])
 @pytest.mark.parametrize("num_slots_to_append", [50])

--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -42,18 +42,16 @@ def test_models(
    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
    enforce_eager = backend_by_env_var == "FLASHINFER"
-    hf_model = hf_runner(model, dtype=dtype)
+    with hf_runner(model, dtype=dtype) as hf_model:
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with vllm_runner(model,
-    vllm_model = vllm_runner(
+                     dtype=dtype,
-        model,
+                     tensor_parallel_size=2,
-        dtype=dtype,
+                     enforce_eager=enforce_eager,
-        tensor_parallel_size=2,
+                     distributed_executor_backend=distributed_executor_backend
-        enforce_eager=enforce_eager,
+                     ) as vllm_model:
-        distributed_executor_backend=distributed_executor_backend)
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]

--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -45,21 +45,19 @@ def test_models(
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size
-    hf_model = hf_runner(model, dtype=dtype)
+    with hf_runner(model, dtype=dtype) as hf_model:
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
-    vllm_model = vllm_runner(
+    with vllm_runner(
-        model,
+            model,
-        dtype=dtype,
+            dtype=dtype,
-        tensor_parallel_size=2,
+            tensor_parallel_size=2,
-        max_num_seqs=max_num_seqs,
+            max_num_seqs=max_num_seqs,
-        enable_chunked_prefill=enable_chunked_prefill,
+            enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
+            max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
+            distributed_executor_backend=distributed_executor_backend,
-    )
+    ) as vllm_model:
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]

--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
+import os
+import torch
+from vllm.distributed.parallel_state import is_in_the_same_node
+torch.distributed.init_process_group(backend="gloo")
+test_result = is_in_the_same_node(torch.distributed.group.WORLD)
+expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+assert test_result == expected, f"Expected {expected}, got {test_result}"
--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
-    yield vllm_model
+        yield vllm_model
-    del vllm_model
 def test_stop_reason(vllm_model, example_prompts):

--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@@ -10,7 +10,8 @@ MAX_TOKENS = 200
 @pytest.fixture(scope="session")
 def vllm_model(vllm_runner):
-    return vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model
 @pytest.mark.skip_global_cleanup

--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -63,7 +63,6 @@ def test_guided_logits_processors():
                                  tokenizer,
                                  whitespace_pattern=None)
-    regex_LP.init_state()
    token_ids = tokenizer.encode(
        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
    tensor = torch.rand(32000)
@@ -72,7 +71,6 @@ def test_guided_logits_processors():
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)
-    json_LP.init_state()
    token_ids = tokenizer.encode(
        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
    tensor = torch.rand(32000)

--- a/tests/entrypoints/test_llm_generate_multiple_loras.py
+++ b/tests/entrypoints/test_llm_generate_multiple_loras.py
+import weakref
+import pytest
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+from vllm import LLM
+from vllm.lora.request import LoRARequest
+from ..conftest import cleanup
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+PROMPTS = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+pytestmark = pytest.mark.llm
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME,
+              tensor_parallel_size=1,
+              max_model_len=8192,
+              enable_lora=True,
+              max_loras=4,
+              max_lora_rank=64,
+              max_num_seqs=128,
+              enforce_eager=True)
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+        del llm
+    cleanup()
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+@pytest.mark.skip_global_cleanup
+def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
+    lora_request = [
+        LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
+        for idx in range(len(PROMPTS))
+    ]
+    # Multiple LoRARequests should be matched with each prompt
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+    # ValueError is raised if the number of LoRA requests and prompts differ
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
+    # Single LoRARequest should be applied to every prompt
+    single_lora_request = lora_request[0]
+    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
+    assert len(PROMPTS) == len(outputs)
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
+    choice = completion.choices[0]
-    assert completion.choices[0].finish_reason == "length"
+    assert len(choice.text) >= 5
+    assert choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)
@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
        max_tokens=5,
        temperature=0.0,
    )
-    assert completion.choices[0].text is not None and len(
+    assert len(completion.choices[0].text) >= 5
-        completion.choices[0].text) >= 5
 @pytest.mark.asyncio
@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras
+    # just test 1 lora hereafter
    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
                             model_name: str):
@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) <= 1
+    assert len(choice.logprobs.top_logprobs[0]) == 1
 @pytest.mark.asyncio
@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) <= 6
+    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
 @pytest.mark.asyncio
@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
-            logprobs=6,
+            # vLLM has higher default max_logprobs (20 instead of 5) to support
+            # both Completion API and Chat Completion API
+            logprobs=21,
        )
        ...
    with pytest.raises(
@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
-            logprobs=6,
+            # vLLM has higher default max_logprobs (20 instead of 5) to support
+            # both Completion API and Chat Completion API
+            logprobs=30,
            stream=True,
        )
        async for chunk in stream:
@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
        max_tokens=5,
        temperature=0.0,
    )
-    completion = completion.choices[0].text
+    assert len(completion.choices[0].text) >= 0
-    assert completion is not None and len(completion) >= 0
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_single_chat_session(server, client: openai.AsyncOpenAI,
-                                   model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-    # test single completion
-    chat_completion = await client.chat.completions.create(model=model_name,
-                                                           messages=messages,
-                                                           max_tokens=10,
-                                                           logprobs=True,
-                                                           top_logprobs=5)
-    assert chat_completion.id is not None
-    assert chat_completion.choices is not None and len(
-        chat_completion.choices) == 1
-    assert chat_completion.choices[0].message is not None
-    assert chat_completion.choices[0].logprobs is not None
-    assert chat_completion.choices[0].logprobs.content[
-        0].top_logprobs is not None
-    assert len(
-        chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
-    messages.append({"role": "assistant", "content": message.content})
-    # test multi-turn dialogue
-    messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_tokens=10,
-    )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
 @pytest.mark.asyncio
@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
-    assert len(choice.logprobs.content[0].top_logprobs) <= 1
+    assert len(choice.logprobs.content[0].top_logprobs) == 0
 @pytest.mark.asyncio
@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
-    assert len(choice.logprobs.content[0].top_logprobs) <= 6
+    assert len(choice.logprobs.content[0].top_logprobs) == 5
 @pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
 async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
                                      model_name: str):
    messages = [{
@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora hereafter
+    "model_name",
+    [MODEL_NAME, "zephyr-lora"],
+)
+async def test_single_chat_session(server, client: openai.AsyncOpenAI,
+                                   model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert chat_completion.id is not None
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=37, total_tokens=47)
+    message = choice.message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
 )
@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
        temperature=0.0,
    )
    single_output = single_completion.choices[0].text
-    single_usage = single_completion.usage
    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
-    assert chunk.usage == single_usage
    assert "".join(chunks) == single_output
@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
    assert "".join(chunks) == output
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_chat_completion_stream_options(server,
+                                              client: openai.AsyncOpenAI,
+                                              model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?"
+    }]
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": None})
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": True})
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
+                                         model_name: str):
+    prompt = "What is the capital of France?"
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": None})
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": True})
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    # just test 1 lora hereafter
@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
        logit_bias={str(token_id): 100},
        seed=42,
    )
-    assert completion.choices[0].text is not None and len(
+    assert len(completion.choices[0].text) >= 5
-        completion.choices[0].text) >= 5
    response_tokens = tokenizer(completion.choices[0].text,
                                add_special_tokens=False)["input_ids"]
    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
                        guided_decoding_backend=guided_decoding_backend))
    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 3
+    assert len(completion.choices) == 3
    for i in range(3):
-        assert completion.choices[i].text is not None
        output_json = json.loads(completion.choices[i].text)
        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
                        guided_decoding_backend=guided_decoding_backend))
    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 3
+    assert len(completion.choices) == 3
    for i in range(3):
-        assert completion.choices[i].text is not None
        assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
                        guided_decoding_backend=guided_decoding_backend))
    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 2
+    assert len(completion.choices) == 2
    for i in range(2):
        assert completion.choices[i].text in TEST_CHOICE
@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
        top_logprobs=5,
        extra_body=dict(guided_choice=TEST_CHOICE,
                        guided_decoding_backend=guided_decoding_backend))
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.content is not None
    top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
    # -9999.0 is the minimum logprob returned by OpenAI
-    assert all(
+    for item in top_logprobs:
-        isinstance(token.logprob, float) and token.logprob >= -9999.0
+        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
-        for token in top_logprobs)
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_named_tool_use(server, client: openai.AsyncOpenAI,
+                              guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+    # non-streaming
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": TEST_SCHEMA
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        })
+    message = chat_completion.choices[0].message
+    assert len(message.content) == 0
+    json_string = message.tool_calls[0].function.arguments
+    json1 = json.loads(json_string)
+    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+    messages.append({"role": "assistant", "content": json_string})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "Give me another one with a different name and age"
+    })
+    # streaming
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": TEST_SCHEMA
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        },
+        stream=True)
+    output = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        assert delta.content is None or len(delta.content) == 0
+        if delta.tool_calls:
+            output.append(delta.tool_calls[0].function.arguments)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    json2 = json.loads("".join(output))
+    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    assert json1["name"] != json2["name"]
+    assert json1["age"] != json2["age"]
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
+async def test_required_tool_use_not_yet_supported(
+        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice="required")
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice="auto")
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
+async def test_inconsistent_tool_choice_and_tools(
+        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(model=MODEL_NAME,
+                                             messages=messages,
+                                             max_tokens=1000,
+                                             tool_choice={
+                                                 "type": "function",
+                                                 "function": {
+                                                     "name":
+                                                     "dummy_function_name"
+                                                 }
+                                             })
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice={
+                "type": "function",
+                "function": {
+                    "name": "nondefined_function_name"
+                }
+            })
 @pytest.mark.asyncio
@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
            response_format={"type": "json_object"})
        content = resp.choices[0].message.content
+        assert content is not None
        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
@@ -1032,8 +1350,9 @@ number: "1" | "2"
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
+@pytest.mark.parametrize("logprobs_arg", [1, 0])
 async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
-                                       model_name: str):
+                                       model_name: str, logprobs_arg: int):
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
-                                                     logprobs=1)
+                                                     logprobs=logprobs_arg)
        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
-        assert (completion.choices[0].text is not None
+        assert re.search(r"^" + prompt_text, completion.choices[0].text)
-                and re.search(r"^" + prompt_text, completion.choices[0].text))
        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
+        for top_logprobs in logprobs.top_logprobs[1:]:
+            assert max(logprobs_arg,
+                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
 )
 async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
                                model_name: str):
-    input = [
+    input_texts = [
        "The chef prepared a delicious meal.",
    ]
    # test single embedding
    embeddings = await client.embeddings.create(
        model=model_name,
-        input=input,
+        input=input_texts,
        encoding_format="float",
    )
    assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 1
+    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 9
    assert embeddings.usage.total_tokens == 9
    # test using token IDs
-    input = [1, 1, 1, 1, 1]
+    input_tokens = [1, 1, 1, 1, 1]
    embeddings = await client.embeddings.create(
        model=model_name,
-        input=input,
+        input=input_tokens,
        encoding_format="float",
    )
    assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 1
+    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 5
@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
 async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
                               model_name: str):
    # test List[str]
-    inputs = [
+    input_texts = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    embeddings = await client.embeddings.create(
        model=model_name,
-        input=inputs,
+        input=input_texts,
        encoding_format="float",
    )
    assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 3
+    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 4096
    # test List[List[int]]
-    inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
+    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-              [25, 32, 64, 77]]
+                    [25, 32, 64, 77]]
    embeddings = await client.embeddings.create(
        model=model_name,
-        input=inputs,
+        input=input_tokens,
        encoding_format="float",
    )
    assert embeddings.id is not None
-    assert embeddings.data is not None and len(embeddings.data) == 4
+    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 17

--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/test_openai_vision.py
+from pathlib import Path
+from typing import Dict
+import openai
+import pytest
+import pytest_asyncio
+import ray
+from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
+from ..utils import ServerRunner
+MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
+                       "examples/template_llava.jinja")
+assert LLAVA_CHAT_TEMPLATE.exists()
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_URLS = [
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+pytestmark = pytest.mark.openai
+@pytest.fixture(scope="module")
+def server():
+    """Launch the OpenAI-compatible server for this module's vision tests.
+
+    Ray is initialised, a ``ServerRunner`` actor is created with the LLaVA
+    model and image-input arguments below, and the fixture blocks until the
+    runner reports ready before yielding it.
+
+    Ray is shut down in a ``finally`` clause so that a failure while
+    creating the runner or waiting for readiness does not leave a Ray
+    session behind for later test modules.
+    """
+    ray.init()
+    try:
+        server_runner = ServerRunner.remote([
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "4096",
+            "--enforce-eager",
+            "--image-input-type",
+            "pixel_values",
+            "--image-token-id",
+            "32000",
+            "--image-input-shape",
+            "1,3,336,336",
+            "--image-feature-size",
+            "576",
+            "--chat-template",
+            str(LLAVA_CHAT_TEMPLATE),
+        ])
+        # Block until the runner reports that it is ready.
+        ray.get(server_runner.ready.remote())
+        yield server_runner
+    finally:
+        # Runs on normal teardown and also when startup above raised.
+        ray.shutdown()
+@pytest.fixture(scope="session")
+def client():
+    """Provide an ``openai.AsyncOpenAI`` client for the tests.
+
+    The client targets ``http://localhost:8000/v1`` with a fixed API key.
+    NOTE(review): presumably this is the address the ``server`` fixture's
+    runner listens on -- confirm against ``ServerRunner``.
+    """
+    client = openai.AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    yield client
+@pytest_asyncio.fixture(scope="session")
+async def base64_encoded_image() -> Dict[str, str]:
+    """Map every URL in ``TEST_IMAGE_URLS`` to its base64-encoded image.
+
+    Each image is fetched with ``ImageFetchAiohttp.fetch_image`` and passed
+    to ``encode_image_base64``; the fetches are awaited one after another
+    inside the comprehension.
+    """
+    return {
+        image_url:
+        encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
+        for image_url in TEST_IMAGE_URLS
+    }
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
+                                         model_name: str, image_url: str):
+    """Run a two-turn chat whose first user message carries an image URL.
+
+    The first turn checks the choice count, finish reason, token usage and
+    the assistant reply; the second turn appends a follow-up user message
+    and only checks that content is returned.
+    """
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    # NOTE(review): 596 prompt tokens presumably covers the 576 image
+    # features configured on the server plus the text prompt -- confirm.
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+    message = choice.message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    # Only the presence of content is checked for the follow-up turn.
+    assert message.content is not None
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_single_chat_session_image_base64encoded(
+        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
+        base64_encoded_image: Dict[str, str]):
+    """Run a two-turn chat whose image is sent inline as a base64 data URL.
+
+    Mirrors the URL-based chat test, except that the image for ``image_url``
+    is looked up in ``base64_encoded_image`` and embedded in the request.
+    """
+    # NOTE(review): the data URL always declares image/jpeg although some
+    # entries of TEST_IMAGE_URLS are PNG files -- presumably the server
+    # detects the real format from the bytes; confirm.
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url":
+                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+    message = choice.message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    # Only the presence of content is checked for the follow-up turn.
+    assert message.content is not None
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
+                                    model_name: str, image_url: str):
+    """Compare a streamed image chat completion with the non-streamed one.
+
+    The same request is issued twice with ``temperature=0.0``: once
+    normally and once with ``stream=True``. The concatenated streamed
+    content must equal the non-streamed content, and exactly one chunk may
+    carry a finish reason.
+    """
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    # `chunk` and `delta` below are the values left over from the final loop
+    # iteration, so these checks apply to the last streamed chunk.
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_multi_image_input(server, client: openai.AsyncOpenAI,
+                                 model_name: str, image_url: str):
+    """Check that a message with two images is rejected without side effects.
+
+    The chat request containing two ``image_url`` parts is expected to raise
+    ``openai.BadRequestError``. A plain completion request is then sent to
+    confirm that the server still answers afterwards.
+    """
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    with pytest.raises(openai.BadRequestError):  # test multi-image input
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+        )
+    # the server should still work afterwards
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Keep the response object and its text under separate names.
+    completion_text = completion.choices[0].text
+    assert completion_text is not None
+if __name__ == "__main__":
+    pytest.main([__file__])
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -44,7 +44,7 @@ def test_act_and_mul(
    elif activation == "gelu_tanh":
        layer = GeluAndMul(approximate="tanh")
    out = layer(x)
-    ref_out = layer._forward(x)
+    ref_out = layer.forward_native(x)
    # The SiLU and GELU implementations are equivalent to the native PyTorch
    # implementations, so we can do exact comparison.
    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
@@ -72,7 +72,7 @@ def test_activation(
    x = torch.randn(num_tokens, d, dtype=dtype)
    layer = activation()
    out = layer(x)
-    ref_out = layer._forward(x)
+    ref_out = layer.forward_native(x)
    assert torch.allclose(out,
                          ref_out,
                          atol=get_default_atol(out),

--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
-import os
 from unittest.mock import patch
 import pytest
 import torch
+from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL,
+                                 override_backend_env_variable)
 from vllm.attention.selector import which_attn_to_use
 @pytest.mark.parametrize(
    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
-def test_env(name: str, device: str):
+def test_env(name: str, device: str, monkeypatch):
    """Test that the attention selector can be set via environment variable.
    Note that we do not test FlashAttn because it is the default backend.
    """
-    name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
-    os.environ["VLLM_ATTENTION_BACKEND"] = name
+    override_backend_env_variable(monkeypatch, name)
    if device == "cpu":
        with patch("vllm.attention.selector.is_cpu", return_value=True):
@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
                                    torch.float16, 16)
        assert backend.name == name
-    if name_backup is not None:
-        os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
-def test_flash_attn():
+def test_flash_attn(monkeypatch):
    """Test FlashAttn validation."""
-    name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
-    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
+    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
    # Unsupported CUDA arch
    with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
@@ -71,14 +69,9 @@ def test_flash_attn():
    backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
    assert backend.name != "FLASH_ATTN"
-    if name_backup is not None:
-        os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
-def test_invalid_env():
+def test_invalid_env(monkeypatch):
    """Throw an exception if the backend name is invalid."""
-    name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
+    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
-    os.environ["VLLM_ATTENTION_BACKEND"] = "INVALID"
    with pytest.raises(ValueError):
        which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
-    os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
-@pytest.mark.parametrize("m", [512, 222, 33, 1])
+@pytest.mark.parametrize("m", [512, 222, 100, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module):
                                        self.out_dtype)
-def test_cutlass_cuda_graph():
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
    m, n, k = 512, 512, 512
    a = to_int8(torch.randn((m, k), device="cuda"))
    b = to_int8(torch.randn((n, k), device="cuda").t())
-    scale_a = (torch.randn((m, 1), device="cuda", dtype=torch.float32) / 10)
+    m_a_scales = m if per_act_token else 1
-    scale_b = (torch.randn((1, n), device="cuda", dtype=torch.float32) / 10)
+    n_b_scales = n if per_out_ch else 1
+    scale_a = (torch.randn(
+        (m_a_scales, 1), device="cuda", dtype=torch.float32) / 10)
+    scale_b = (torch.randn(
+        (1, n_b_scales), device="cuda", dtype=torch.float32) / 10)
    # Construct a trivial model with a single layer that calls a CUTLASS kernel
    model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16)

--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
 import pytest
 import torch
-from vllm._C import ops
+# ruff: noqa: F401
+import vllm._C
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
+HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
+                8193]  # Arbitrary values for testing
 NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
 SEEDS = [0]
 SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
+                                   dtype: torch.dtype, seed: int) -> None:
+    """Compare ``dynamic_scaled_int8_quant`` with a PyTorch reference.
+
+    The reference derives one scale per row as ``row_max / 127`` and
+    quantizes by dividing, rounding and clamping to the int8 range. The
+    custom op's scales must match, and its quantized output may differ
+    from the reference by at most 1.
+    """
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    int8_traits = torch.iinfo(torch.int8)
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
+    # NOTE(review): this takes the plain row maximum, not the absolute
+    # maximum; the input comes from torch.rand, so presumably no negative
+    # values occur -- confirm if the input distribution changes.
+    x_token_max, _ = x.max(dim=1)
+    x_token_max = x_token_max.to(dtype=torch.float32)
+    scales = (x_token_max / float(127.0))[:, None].to(device="cuda",
+                                                      dtype=torch.float32)
+    torch_out = (x / scales).round().clamp(int8_traits.min,
+                                           int8_traits.max).to(torch.int8)
+    # Buffers handed to the custom op; presumably it fills both in place.
+    ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
+    scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
+    torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out)
+    assert torch.allclose(scales_out, scales)
+    assert torch.allclose(torch_out, ops_out,
+                          atol=1)  # big atol to account for rounding errors
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("scale", SCALE)
 @torch.inference_mode()
-def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
+def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
-               seed: int, scale: float) -> None:
+                                  dtype: torch.dtype, seed: int,
+                                  scale: float) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
+    int8_traits = torch.iinfo(torch.int8)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
-    out1 = (x / scale).round().clamp(
+    out1 = (x / scale).round().clamp(int8_traits.min,
-        torch.iinfo(torch.int8).min,
+                                     int8_traits.max).to(torch.int8)
-        torch.iinfo(torch.int8).max).to(torch.int8)
    out2 = torch.empty_like(x, dtype=torch.int8)
-    ops.static_scaled_int8_quant(out2, x, scale)
+    scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
+    torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument)
    assert torch.allclose(out1, out2,
                          atol=1)  # big atol to account for rounding errors
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -42,7 +42,7 @@ def test_rms_norm(
    # NOTE(woosuk): The reference implementation should be executed first
    # because the custom kernel is in-place.
-    ref_out = layer._forward(x, residual)
+    ref_out = layer.forward_native(x, residual)
    out = layer(x, residual)
    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
    # numerical errors than other operators because they involve reductions.

--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -64,7 +64,7 @@ def test_rotary_embedding(
    # NOTE(woosuk): The reference implementation should be executed first
    # because the custom kernel is in-place.
-    ref_query, ref_key = rope._forward(positions, query, key)
+    ref_query, ref_key = rope.forward_native(positions, query, key)
    out_query, out_key = rope.forward(positions, query, key)
    # Compare the results.
    assert torch.allclose(out_query,
@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
    # NOTE(woosuk): The reference implementation should be executed first
    # because the custom kernel is in-place.
-    ref_query, ref_key = rope._forward(positions, query, key)
+    ref_query, ref_key = rope.forward_native(positions, query, key)
    out_query, out_key = rope.forward(positions,
                                      query,
                                      key,
@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
    # NOTE(woosuk): The reference implementation should be executed first
    # because the custom kernel is in-place.
-    ref_query, ref_key = rope._forward(positions, query, key, query_offsets)
+    ref_query, ref_key = rope.forward_native(positions, query, key,
+                                             query_offsets)
    out_query, out_key = rope.forward(positions, query, key,
                                      query_offsets.flatten())
    # Compare the results.

--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
+"""Kernel test utils"""
+import pytest
+STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
+STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
+STR_INVALID_VAL: str = "INVALID"
+def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
+                                  backend_name: str) -> None:
+    '''
+    Temporarily set the environment variable named by STR_BACKEND_ENV_VAR
+    (VLLM_ATTENTION_BACKEND) to the given backend name.
+    The value is set through pytest's monkeypatch so that the environment
+    is reset once the test context exits.
+    Arguments:
+    * mpatch: pytest monkeypatch instance
+    * backend_name: attention backend name to force
+    Returns: None
+    '''
+    mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -42,10 +42,24 @@ def cleanup():
    ray.shutdown()
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    """Decide whether global cleanup should run after the current test.
+
+    Returns False when the requesting test carries the
+    ``skip_global_cleanup`` marker, True otherwise. Subdirectories may also
+    skip global cleanup by overriding this fixture.
+    This can provide a ~10x speedup for non-GPU unit tests since they don't need
+    to initialize torch.
+    """
+    # get_closest_marker returns a falsy value when the marker is absent.
+    if request.node.get_closest_marker("skip_global_cleanup"):
+        return False
+    return True
 @pytest.fixture(autouse=True)
-def cleanup_fixture():
+def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
-    cleanup()
+    if should_do_global_cleanup_after_test:
+        cleanup()
 @pytest.fixture

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -2,6 +2,7 @@ import random
 from copy import deepcopy
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
+from unittest.mock import patch
 import pytest
 import torch
@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
 from vllm.model_executor.utils import set_random_seed
 from .utils import DummyLoRAManager
@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
        logits_processor = LogitsProcessor(
            vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
        lora_logits_processor = LogitsProcessorWithLoRA(
-            logits_processor, 1024, linear.weight.dtype, linear.weight.device)
+            logits_processor, 1024, linear.weight.dtype, linear.weight.device,
+            None)
        lora_logits_processor.create_lora_weights(max_loras, lora_config)
        return linear, logits_processor, lora_logits_processor
@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
    torch.allclose(ref_q, actual_q)
    torch.allclose(ref_k, actual_k)
+@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
+@pytest.mark.parametrize("seed", list(range(256)))
+def test_vocab_parallel_embedding_indices(tp_size, seed):
+    """Check the per-rank shard indices of VocabParallelEmbedding.
+
+    For a randomly sized vocabulary (seeded by ``seed``), builds one embedding
+    per tensor-parallel rank and verifies that the org and added vocab ranges
+    are contiguous across ranks, do not overlap, and sum to the expected
+    sizes. Also verifies that the sharded-to-full mapping reorders the
+    concatenated per-rank layout into 0..vocab_size-1 followed by padding.
+    """
+    random.seed(seed)
+    vocab_size = random.randint(4000, 64000)
+    added_vocab_size = random.randint(0, 1024)
+    org_vocab_size = vocab_size - added_vocab_size
+    last_org_vocab_end_index = 0
+    last_added_vocab_end_index = org_vocab_size
+    computed_vocab_size = 0
+    computed_org_vocab_size = 0
+    computed_added_vocab_size = 0
+    vocab_size_padded = -1
+    all_org_tokens = []
+    all_added_tokens = []
+    token_ids = []
+    for tp_rank in range(tp_size):
+        # The rank/world-size lookups are patched only while the embedding
+        # is being constructed.
+        with patch(
+                "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
+                return_value=tp_rank
+        ), patch(
+                "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
+                return_value=tp_size):
+            vocab_embedding = VocabParallelEmbedding(
+                vocab_size, 1, org_num_embeddings=org_vocab_size)
+        vocab_size_padded = vocab_embedding.num_embeddings_padded
+        shard_indices = vocab_embedding.shard_indices
+        # Assert that the ranges are contiguous
+        assert shard_indices.org_vocab_start_index == last_org_vocab_end_index
+        assert (shard_indices.added_vocab_start_index ==
+                last_added_vocab_end_index)
+        # Accumulate per-shard sizes; the totals are compared after the loop
+        computed_vocab_size += shard_indices.num_elements_padded
+        computed_org_vocab_size += shard_indices.num_org_elements
+        computed_added_vocab_size += shard_indices.num_added_elements
+        # Record this shard's org/added ranges; overlap is asserted after
+        # the loop
+        all_org_tokens.extend(
+            range(shard_indices.org_vocab_start_index,
+                  shard_indices.org_vocab_end_index))
+        all_added_tokens.extend(
+            range(shard_indices.added_vocab_start_index,
+                  shard_indices.added_vocab_end_index))
+        # Append this shard's layout: org tokens, org padding (-1),
+        # added tokens, added padding (-1)
+        token_ids.extend(
+            range(shard_indices.org_vocab_start_index,
+                  shard_indices.org_vocab_end_index))
+        token_ids.extend([-1] * (shard_indices.num_org_elements_padded -
+                                 shard_indices.num_org_elements))
+        token_ids.extend(
+            range(shard_indices.added_vocab_start_index,
+                  shard_indices.added_vocab_end_index))
+        token_ids.extend([-1] * (shard_indices.num_added_elements_padded -
+                                 shard_indices.num_added_elements))
+        last_org_vocab_end_index = shard_indices.org_vocab_end_index
+        last_added_vocab_end_index = shard_indices.added_vocab_end_index
+    assert computed_vocab_size == vocab_size_padded
+    assert computed_org_vocab_size == org_vocab_size
+    assert computed_added_vocab_size == added_vocab_size
+    # Ensure that the ranges are not overlapping
+    assert len(all_org_tokens) == len(set(all_org_tokens))
+    assert len(all_added_tokens) == len(set(all_added_tokens))
+    assert not set(all_org_tokens).intersection(set(all_added_tokens))
+    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
+    # The mapping is taken from the last rank's embedding; it may be None
+    # only when tp_size == 1.
+    reindex_mapping = vocab_embedding.get_sharded_to_full_mapping()
+    assert reindex_mapping is not None or tp_size == 1
+    if reindex_mapping is not None:
+        # After reindexing, real tokens come first in order and all
+        # padding entries (-1) follow.
+        reindexed_token_ids = token_ids_tensor[reindex_mapping]
+        expected = torch.tensor(list(range(0, vocab_size)))
+        assert reindexed_token_ids[:vocab_size].equal(expected)
+        assert torch.all(reindexed_token_ids[vocab_size:] == -1)
+def test_get_masked_input_and_mask():
+    """Check the remapped token ids returned by get_masked_input_and_mask.
+
+    Every expected tensor below follows one rule: a token inside
+    [org_vocab_start_index, org_vocab_end_index) maps to its offset in that
+    range; a token inside [added_vocab_start_index, added_vocab_end_index)
+    maps to its offset in that range plus the org range length plus
+    num_org_vocab_padding; any other token maps to 0.
+    Only the first return value is checked; the second is ignored.
+    """
+    x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    # base tp 1 case, no padding
+    modified_x, _ = get_masked_input_and_mask(x,
+                                              org_vocab_start_index=0,
+                                              org_vocab_end_index=8,
+                                              added_vocab_start_index=8,
+                                              added_vocab_end_index=12,
+                                              num_org_vocab_padding=0)
+    assert torch.equal(x, modified_x)
+    # tp 2 case, no padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=0)
+    modified_x_rank_1, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=8,
+        added_vocab_start_index=10,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=0)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]))
+    # tp 4 case, no padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=2,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=9,
+                                                     num_org_vocab_padding=0)
+    modified_x_rank_1, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=2,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=9,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=0)
+    modified_x_rank_2, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=6,
+        added_vocab_start_index=10,
+        added_vocab_end_index=11,
+        num_org_vocab_padding=0)
+    modified_x_rank_3, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=6,
+        org_vocab_end_index=8,
+        added_vocab_start_index=11,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=0)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]))
+    assert torch.equal(modified_x_rank_2,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]))
+    assert torch.equal(modified_x_rank_3,
+                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]))
+    # base tp 1 case, with padding
+    modified_x, _ = get_masked_input_and_mask(x,
+                                              org_vocab_start_index=0,
+                                              org_vocab_end_index=8,
+                                              added_vocab_start_index=8,
+                                              added_vocab_end_index=12,
+                                              num_org_vocab_padding=2)
+    assert torch.equal(modified_x,
+                       torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]))
+    # tp 2 case, with padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=2)
+    modified_x_rank_1, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=8,
+        added_vocab_start_index=10,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=2)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]))
+    # tp 4 case, with padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=2,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=9,
+                                                     num_org_vocab_padding=2)
+    modified_x_rank_1, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=2,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=9,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=2)
+    modified_x_rank_2, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=6,
+        added_vocab_start_index=10,
+        added_vocab_end_index=11,
+        num_org_vocab_padding=2)
+    modified_x_rank_3, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=6,
+        org_vocab_end_index=8,
+        added_vocab_start_index=11,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=2)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]))
+    assert torch.equal(modified_x_rank_2,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]))
+    assert torch.equal(modified_x_rank_3,
+                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]))
--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
    return generated_texts
-@pytest.mark.parametrize("tp_size", [1])
+@pytest.mark.parametrize("tp_size", [1, 2, 4])
-def test_llama_lora(sql_lora_files, tp_size):
+def test_llama_lora(sql_lora_files, tp_size, num_gpus_available):
-    # Cannot use as it will initialize torch.cuda too early...
+    if num_gpus_available < tp_size:
-    # if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    llm = vllm.LLM(MODEL_PATH,
                   enable_lora=True,
@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
    print("removing lora")
-@pytest.mark.skip("Requires multiple GPUs")
+def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
-def test_llama_tensor_parallel_equality(sql_lora_files):
+    if num_gpus_available < 4:
-    # Cannot use as it will initialize torch.cuda too early...
+        pytest.skip("Not enough GPUs for tensor parallelism 4")
-    # if torch.cuda.device_count() < 4:
-    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
    llm_tp1 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,