Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

6d2051cc · zhuwenwen · 2c7f740a · a2c71c54 · 6d2051cc · 6d2051cc
Commit 6d2051cc authored Oct 21, 2024 by zhuwenwen
20 changed files
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -131,19 +131,22 @@ def create_seq_group_metadata_from_prompts(
        for i, final_len in enumerate(final_prompt_lens)
    }

-    return [
-        SequenceGroupMetadata(
-            request_id=str(i),
-            is_prompt=len(cont_token_ids) == 0,
-            seq_data={
-                i: SequenceData.from_seqs(prompt_token_ids[:],
-                                          cont_token_ids[:]),
-            },
-            sampling_params=SamplingParams(temperature=0.0, ),
-            block_tables={i: block_allocations[i][:]},
-        ) for i, (prompt_token_ids,
-                  cont_token_ids) in enumerate(zip(prompts, continuations))
-    ]
+    seq_grou_metadata_list = []
+    for i, (prompt_token_ids,
+            cont_token_ids) in enumerate(zip(prompts, continuations)):
+        data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids)
+        data.update_num_computed_tokens(
+            len(prompt_token_ids) + len(cont_token_ids) - 1)
+        seq_data = {i: data}
+        seq_grou_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=str(i),
+                is_prompt=len(cont_token_ids) == 0,
+                seq_data=seq_data,
+                sampling_params=SamplingParams(temperature=0.0),
+                block_tables={i: block_allocations[i][:]},
+            ))
+    return seq_grou_metadata_list


 def assert_logprobs_dict_allclose(

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -64,9 +64,9 @@ def test_get_sliding_window():


 def test_rope_customization():
-    TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0}
+    TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}
    TEST_ROPE_THETA = 16_000_000.0
-    LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}
+    LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}

    llama_model_config = ModelConfig(
        "meta-llama/Meta-Llama-3-8B-Instruct",

--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -2,6 +2,7 @@ from typing import List

 import pytest

+from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_and_batch_prompt

 STRING_INPUTS = [
@@ -51,3 +52,28 @@ def test_parse_single_batch_token_consistent(token_input: List[int]):
 def test_parse_single_batch_string_slice(inputs_slice: slice):
    assert parse_and_batch_prompt(STRING_INPUTS)[inputs_slice] \
        == parse_and_batch_prompt(STRING_INPUTS[inputs_slice])
+
+
+# yapf: disable
+@pytest.mark.parametrize('mm_processor_kwargs,expected_mm_kwargs', [
+    (None, [{}, {}]),
+    ({}, [{}, {}]),
+    ({"foo": 100}, [{"foo": 100}, {"foo": 100}]),
+    ([{"foo": 100}, {"bar": 200}], [{"foo": 100}, {"bar": 200}]),
+])
+# yapf: enable
+def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
+    """Verify how zip_enc_dec_prompts fills in mm_processor_kwargs.
+
+    The parametrized cases pass None, an empty dict, a single dict, or a
+    per-prompt list of dicts, together with the kwargs each zipped prompt
+    is expected to carry.
+    """
+    enc_texts = ['An encoder prompt', 'Another encoder prompt']
+    dec_texts = ['A decoder prompt', 'Another decoder prompt']
+    pairs = zip_enc_dec_prompts(enc_texts, dec_texts, mm_processor_kwargs)
+
+    # One zipped entry per encoder/decoder pair.
+    assert len(pairs) == len(enc_texts) == len(dec_texts)
+
+    expected_rows = zip(pairs, enc_texts, dec_texts, expected_mm_kwargs)
+    for pair, enc_text, dec_text, want_kwargs in expected_rows:
+        assert isinstance(pair, dict)
+        # Exactly the three keys checked below.
+        assert len(pair.keys()) == 3
+        assert pair['encoder_prompt'] == enc_text
+        assert pair['decoder_prompt'] == dec_text
+        assert pair['mm_processor_kwargs'] == want_kwargs
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,7 +7,7 @@ from typing import AsyncIterator, Tuple
 import pytest

 from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs,
-                        get_open_port, merge_async_iterators)
+                        get_open_port, merge_async_iterators, supports_kw)

 from .utils import error_on_warning

@@ -136,6 +136,8 @@ def parser():
 def parser_with_config():
    parser = FlexibleArgumentParser()
    parser.add_argument('serve')
+    parser.add_argument('model_tag')
+    parser.add_argument('--served-model-name', type=str)
    parser.add_argument('--config', type=str)
    parser.add_argument('--port', type=int)
    parser.add_argument('--tensor-parallel-size', type=int)
@@ -190,33 +192,77 @@ def test_missing_required_argument(parser):

 def test_cli_override_to_config(parser_with_config):
    args = parser_with_config.parse_args([
-        'serve', '--config', './data/test_config.yaml',
+        'serve', 'mymodel', '--config', './data/test_config.yaml',
        '--tensor-parallel-size', '3'
    ])
    assert args.tensor_parallel_size == 3
    args = parser_with_config.parse_args([
-        'serve', '--tensor-parallel-size', '3', '--config',
+        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
        './data/test_config.yaml'
    ])
    assert args.tensor_parallel_size == 3
+    assert args.port == 12312
+    args = parser_with_config.parse_args([
+        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
+        './data/test_config.yaml', '--port', '666'
+    ])
+    assert args.tensor_parallel_size == 3
+    assert args.port == 666


 def test_config_args(parser_with_config):
    args = parser_with_config.parse_args(
-        ['serve', '--config', './data/test_config.yaml'])
+        ['serve', 'mymodel', '--config', './data/test_config.yaml'])
    assert args.tensor_parallel_size == 2


 def test_config_file(parser_with_config):
    with pytest.raises(FileNotFoundError):
-        parser_with_config.parse_args(['serve', '--config', 'test_config.yml'])
+        parser_with_config.parse_args(
+            ['serve', 'mymodel', '--config', 'test_config.yml'])

    with pytest.raises(ValueError):
        parser_with_config.parse_args(
-            ['serve', '--config', './data/test_config.json'])
+            ['serve', 'mymodel', '--config', './data/test_config.json'])

    with pytest.raises(ValueError):
        parser_with_config.parse_args([
-            'serve', '--tensor-parallel-size', '3', '--config', '--batch-size',
-            '32'
+            'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
+            '--batch-size', '32'
        ])
+
+
+def test_no_model_tag(parser_with_config):
+    """Parsing `serve` with a config file but no model tag raises ValueError."""
+    argv = ['serve', '--config', './data/test_config.yaml']
+    with pytest.raises(ValueError):
+        parser_with_config.parse_args(argv)
+
+
+# yapf: enable
+@pytest.mark.parametrize(
+    "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
+    [
+        # Tests for positional argument support
+        (lambda foo: None, "foo", True, True, False),
+        (lambda foo: None, "foo", False, True, True),
+        # Tests for positional or keyword / keyword only
+        (lambda foo=100: None, "foo", True, True, False),
+        (lambda *, foo: None, "foo", False, True, True),
+        # Tests to make sure the names of variadic params are NOT supported
+        (lambda *args: None, "args", False, True, False),
+        (lambda **kwargs: None, "kwargs", False, True, False),
+        # Tests for if we allow var kwargs to add support
+        (lambda foo: None, "something_else", False, True, False),
+        (lambda foo, **kwargs: None, "something_else", False, True, True),
+        (lambda foo, **kwargs: None, "kwargs", True, True, False),
+        (lambda foo, **kwargs: None, "foo", True, True, False),
+    ])
+# yapf: disable
+def test_supports_kw(callable, kw_name, requires_kw_only, allow_var_kwargs,
+                     is_supported):
+    """Check supports_kw against the expected verdict for each case.
+
+    Each parametrized row supplies a callable, the keyword name to probe,
+    the two supports_kw flags, and the boolean the call should return.
+    """
+    verdict = supports_kw(callable=callable,
+                          kw_name=kw_name,
+                          requires_kw_only=requires_kw_only,
+                          allow_var_kwargs=allow_var_kwargs)
+    assert verdict == is_supported
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
+import pytest
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+
+
+def test_chat_completion_request_with_no_tools():
+    """tool_choice must validate to 'none' when no tools are provided.
+
+    Covers a request with the `tools` key absent, set to None, and set to
+    an empty list.
+    """
+
+    def build_payload(**extra):
+        # Fresh dict per call so validations never share input objects.
+        payload = {
+            'messages': [{
+                'role': 'user',
+                'content': 'Hello'
+            }],
+            'model': 'facebook/opt-125m',
+        }
+        payload.update(extra)
+        return payload
+
+    tools_variants = (
+        {},  # tools key is not present
+        {'tools': None},  # tools key is None
+        {'tools': []},  # tools key present but empty
+    )
+    for extra in tools_variants:
+        payload = build_payload(**extra)
+        request = ChatCompletionRequest.model_validate(payload)
+        assert request.tool_choice == 'none'
+
+
+def test_chat_completion_request_with_tool_choice_but_no_tools():
+    """Requesting tool_choice 'auto' without tools must raise ValueError.
+
+    Covers the `tools` key being absent and being explicitly None.
+    """
+    expected_error = "When using `tool_choice`, `tools` must be set."
+
+    def build_payload(**extra):
+        # Fresh dict per call so validations never share input objects.
+        payload = {
+            'messages': [{
+                'role': 'user',
+                'content': 'Hello'
+            }],
+            'model': 'facebook/opt-125m',
+            'tool_choice': 'auto',
+        }
+        payload.update(extra)
+        return payload
+
+    for extra in ({}, {'tools': None}):
+        with pytest.raises(ValueError, match=expected_error):
+            ChatCompletionRequest.model_validate(build_payload(**extra))
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
@@ -3,18 +3,20 @@ from typing import List
 import openai
 import pytest

-from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL
+from .utils import (MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig,
+                    ensure_system_prompt)


 # test: make sure chat completions without tools provided work even when tools
 # are enabled. This makes sure tool call chat templates work, AND that the tool
 # parser stream processing doesn't change the output of the model.
 @pytest.mark.asyncio
-async def test_chat_completion_without_tools(client: openai.AsyncOpenAI):
+async def test_chat_completion_without_tools(client: openai.AsyncOpenAI,
+                                             server_config: ServerConfig):
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
-        messages=MESSAGES_WITHOUT_TOOLS,
+        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_tokens=150,
        model=model_name,
@@ -34,7 +36,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI):

    # make the same request, streaming
    stream = await client.chat.completions.create(
-        messages=MESSAGES_WITHOUT_TOOLS,
+        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_tokens=150,
        model=model_name,
@@ -77,11 +79,12 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI):
 # tools, to make sure we can still get normal chat completion responses
 # and that they won't be parsed as tools
 @pytest.mark.asyncio
-async def test_chat_completion_with_tools(client: openai.AsyncOpenAI):
+async def test_chat_completion_with_tools(client: openai.AsyncOpenAI,
+                                          server_config: ServerConfig):
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
-        messages=MESSAGES_WITHOUT_TOOLS,
+        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_tokens=150,
        model=model_name,
@@ -102,7 +105,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI):

    # make the same request, streaming
    stream = await client.chat.completions.create(
-        messages=MESSAGES_WITHOUT_TOOLS,
+        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_tokens=150,
        model=model_name,

--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -6,7 +6,7 @@ import pytest

 from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
                    MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL,
-                    WEATHER_TOOL)
+                    WEATHER_TOOL, ServerConfig)


 # test: getting the model to generate parallel tool calls (streaming/not)
@@ -14,7 +14,13 @@ from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
 # may be added in the future. e.g. llama 3.1 models are not designed to support
 # parallel tool calls.
 @pytest.mark.asyncio
-async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
+async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
+                                   server_config: ServerConfig):
+
+    if not server_config.get("supports_parallel", True):
+        pytest.skip("The {} model doesn't support parallel tool calls".format(
+            server_config["model"]))
+
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
@@ -39,7 +45,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
        assert tool_call.type == "function"
        assert tool_call.function is not None
        assert isinstance(tool_call.id, str)
-        assert len(tool_call.id) > 16
+        assert len(tool_call.id) >= 9

        # make sure the weather tool was called correctly
        assert tool_call.function.name == WEATHER_TOOL["function"]["name"]
@@ -102,7 +108,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
            if tool_call.id:
                tool_call_id_count += 1
                assert (isinstance(tool_call.id, str)
-                        and (len(tool_call.id) > 16))
+                        and (len(tool_call.id) >= 9))

            # if parts of the function start being streamed
            if tool_call.function:
@@ -136,7 +142,13 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
 # test: providing parallel tool calls back to the model to get a response
 # (streaming/not)
 @pytest.mark.asyncio
-async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI):
+async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
+                                                server_config: ServerConfig):
+
+    if not server_config.get("supports_parallel", True):
+        pytest.skip("The {} model doesn't support parallel tool calls".format(
+            server_config["model"]))
+
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(

--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -33,7 +33,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
    assert tool_calls[0].type == 'function'
    assert tool_calls[0].function is not None
    assert isinstance(tool_calls[0].id, str)
-    assert len(tool_calls[0].id) > 16
+    assert len(tool_calls[0].id) >= 9

    # make sure the weather tool was called (classic example) with arguments
    assert tool_calls[0].function.name == WEATHER_TOOL["function"]["name"]
@@ -106,7 +106,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):

    assert finish_reason_count == 1
    assert role_name == 'assistant'
-    assert isinstance(tool_call_id, str) and (len(tool_call_id) > 16)
+    assert isinstance(tool_call_id, str) and (len(tool_call_id) >= 9)

    # validate the name and arguments
    assert function_name == WEATHER_TOOL["function"]["name"]

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
-from typing import Dict, List
+from copy import deepcopy
+from typing import Any, Dict, List, Optional

 from openai.types.chat import (ChatCompletionMessageParam,
                               ChatCompletionToolParam)
@@ -7,9 +8,30 @@ from typing_extensions import TypedDict
 from tests.utils import VLLM_PATH


-class ServerConfig(TypedDict):
+class ServerConfig(TypedDict, total=False):
    model: str
    arguments: List[str]
+    system_prompt: Optional[str]
+    supports_parallel: Optional[bool]
+
+
+def patch_system_prompt(messages: List[Dict[str, Any]],
+                        system_prompt: str) -> List[Dict[str, Any]]:
+    """Return a copy of ``messages`` that starts with ``system_prompt``.
+
+    The input list is deep-copied and never modified. If the first message
+    already has the "system" role its content is overwritten; otherwise
+    (including when ``messages`` is empty) a new system message is inserted
+    at the front.
+
+    Args:
+        messages: Chat messages, each a dict with at least a "role" key.
+        system_prompt: Text to use as the system message content.
+
+    Returns:
+        A new message list whose first entry is the system message.
+    """
+    new_messages = deepcopy(messages)
+    # Guard against an empty conversation: indexing [0] would raise
+    # IndexError, whereas prepending the system prompt is well defined.
+    if new_messages and new_messages[0]["role"] == "system":
+        new_messages[0]["content"] = system_prompt
+    else:
+        new_messages.insert(0, {"role": "system", "content": system_prompt})
+    return new_messages
+
+
+def ensure_system_prompt(messages: List[Dict[str, Any]],
+                         config: ServerConfig) -> List[Dict[str, Any]]:
+    """Apply the config's system prompt to ``messages`` when one is set.
+
+    Returns ``messages`` itself (not a copy) when ``config`` has no truthy
+    "system_prompt" entry; otherwise returns the result of
+    patch_system_prompt for that prompt.
+    """
+    system_prompt = config.get("system_prompt")
+    if not system_prompt:
+        return messages
+    return patch_system_prompt(messages, system_prompt)


 # universal args for all models go here. also good if you need to test locally
@@ -23,7 +45,33 @@ CONFIGS: Dict[str, ServerConfig] = {
        "arguments": [
            "--tool-call-parser", "hermes", "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
-        ]
+        ],
+        "system_prompt":
+        "You are a helpful assistant with access to tools. If a tool"
+        " that you have would be helpful to answer a user query, "
+        "call the tool. Otherwise, answer the user's query directly "
+        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
+        "to the user's question - just respond to it normally."
+    },
+    "llama": {
+        "model":
+        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        "arguments": [
+            "--tool-call-parser", "llama3_json", "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja")
+        ],
+        "supports_parallel":
+        False,
+    },
+    "llama3.2": {
+        "model":
+        "meta-llama/Llama-3.2-3B-Instruct",
+        "arguments": [
+            "--tool-call-parser", "llama3_json", "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja")
+        ],
+        "supports_parallel":
+        False,
    },
    "mistral": {
        "model":
@@ -32,7 +80,25 @@ CONFIGS: Dict[str, ServerConfig] = {
            "--tool-call-parser", "mistral", "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
            "--ignore-patterns=\"consolidated.safetensors\""
-        ]
+        ],
+        "system_prompt":
+        "You are a helpful assistant with access to tools. If a tool"
+        " that you have would be helpful to answer a user query, "
+        "call the tool. Otherwise, answer the user's query directly "
+        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
+        "to the user's question - just respond to it normally."
+    },
+    "internlm": {
+        "model":
+        "internlm/internlm2_5-7b-chat",
+        "arguments": [
+            "--tool-call-parser", "internlm", "--chat-template",
+            str(VLLM_PATH /
+                "examples/tool_chat_template_internlm2_tool.jinja"),
+            "--trust_remote_code"
+        ],
+        "supports_parallel":
+        False,
    }
 }

@@ -55,7 +121,7 @@ WEATHER_TOOL: ChatCompletionToolParam = {
                    "type":
                    "string",
                    "description":
-                    "the two-letter abbreviation for the state "
+                    "must the two-letter abbreviation for the state "
                    "that the city is in, e.g. 'CA' which would "
                    "mean 'California'"
                },
@@ -97,15 +163,6 @@ SEARCH_TOOL: ChatCompletionToolParam = {
 }

 MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{
-    "role":
-    "system",
-    "content":
-    "You are a helpful assistant with access to tools. If a tool"
-    " that you have would be helpful to answer a user query, "
-    "call the tool. Otherwise, answer the user's query directly "
-    "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
-    "to the user's question - just respond to it normally."
-}, {
    "role":
    "user",
    "content":

--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -5,9 +5,11 @@ import tempfile

 import depyf

+from vllm.compilation.levels import CompilationLevel
+
 # disable custom dispatcher, let Dynamo takes over
 # all the control
-os.environ['VLLM_DYNAMO_USE_CUSTOM_DISPATCHER'] = "0"
+os.environ['VLLM_TORCH_COMPILE_LEVEL'] = str(CompilationLevel.DYNAMO_AS_IS)

 temp_dir = tempfile.mkdtemp()
 with depyf.prepare_debug(temp_dir):

--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
 import os

+from vllm.compilation.levels import CompilationLevel
+
 from ..utils import compare_two_settings

 # --enforce-eager on TPU causes graph compilation
@@ -9,8 +11,9 @@ os.environ["VLLM_RPC_TIMEOUT"] = "30000"


 def test_custom_dispatcher():
-    compare_two_settings("google/gemma-2b",
-                         arg1=["--enforce-eager"],
-                         arg2=["--enforce-eager"],
-                         env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"},
-                         env2={})
+    compare_two_settings(
+        "google/gemma-2b",
+        arg1=["--enforce-eager"],
+        arg2=["--enforce-eager"],
+        env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)},
+        env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)})
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -98,8 +98,6 @@ def test_traces(trace_service):
        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
    assert attributes.get(
        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
        outputs[0].prompt_token_ids)
@@ -155,8 +153,6 @@ def test_traces_with_detailed_steps(trace_service):
        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
    assert attributes.get(
        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
        outputs[0].prompt_token_ids)

--- a/tests/utils.py
+++ b/tests/utils.py
+import asyncio
 import functools
 import os
 import signal
@@ -7,15 +8,15 @@ import time
 import warnings
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional, Union

 import openai
 import pytest
 import requests
 from openai.types.completion import Completion
-from transformers import AutoTokenizer
-from typing_extensions import ParamSpec
+from typing_extensions import ParamSpec, assert_never

+import vllm.envs as envs
 from tests.models.utils import TextTextLogprobs
 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
@@ -23,8 +24,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
-from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
-                        get_open_port, is_hip)
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils import (FlexibleArgumentParser, GB_bytes,
+                        cuda_device_count_stateless, get_open_port, is_hip)

 if current_platform.is_rocm():
    from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -162,11 +164,140 @@ class RemoteOpenAIServer:
        )


+def _test_completion(
+    client: openai.OpenAI,
+    model: str,
+    prompt: str,
+    token_ids: List[int],
+):
+    """Run a fixed series of completion requests and collect the outputs.
+
+    Issues, in order: a temperature-0 text-prompt request, a temperature-0
+    token-ID request, a seeded temperature-1 request, a seeded
+    temperature-1 request with two prompts, a temperature-0 two-prompt
+    batch, and a streamed temperature-0 two-prompt batch.
+
+    Returns:
+        A list with one result dict per request, in the order issued.
+    """
+    # Options shared by the requests below; every request asks for 5 tokens.
+    zero_temp_opts = dict(model=model, max_tokens=5, temperature=0.0)
+    seeded_opts = dict(model=model, max_tokens=5, seed=33, temperature=1.0)
+
+    def summarize_first(label, response):
+        # Record only the first choice plus the usage stats.
+        first = response.choices[0]
+        return {
+            "test": label,
+            "text": first.text,
+            "finish_reason": first.finish_reason,
+            "usage": response.usage,
+        }
+
+    collected = []
+
+    # test with text prompt
+    response = client.completions.create(prompt=prompt, **zero_temp_opts)
+    collected.append(summarize_first("single_completion", response))
+
+    # test using token IDs
+    response = client.completions.create(prompt=token_ids, **zero_temp_opts)
+    collected.append(summarize_first("token_ids", response))
+
+    # test seeded random sampling
+    response = client.completions.create(prompt=prompt, **seeded_opts)
+    collected.append(summarize_first("seeded_sampling", response))
+
+    # test seeded random sampling with multiple prompts
+    response = client.completions.create(prompt=[prompt, prompt],
+                                         **seeded_opts)
+    collected.append({
+        "test": "seeded_sampling",
+        "text": [choice.text for choice in response.choices],
+        "finish_reason": [choice.finish_reason for choice in response.choices],
+        "usage": response.usage,
+    })
+
+    # test simple list
+    batch = client.completions.create(prompt=[prompt, prompt],
+                                      **zero_temp_opts)
+    collected.append({
+        "test": "simple_list",
+        "text0": batch.choices[0].text,
+        "text1": batch.choices[1].text,
+    })
+
+    # test streaming
+    stream = client.completions.create(prompt=[prompt, prompt],
+                                       stream=True,
+                                       **zero_temp_opts)
+    # Each chunk carries one choice; reassemble the text per prompt index.
+    streamed_texts = ["", ""]
+    for chunk in stream:
+        assert len(chunk.choices) == 1
+        piece = chunk.choices[0]
+        streamed_texts[piece.index] += piece.text
+    collected.append({
+        "test": "streaming",
+        "texts": streamed_texts,
+    })
+
+    return collected
+
+
+def _test_embeddings(
+    client: openai.OpenAI,
+    model: str,
+    text: str,
+):
+    """Request one float-encoded embedding for ``text`` and report it.
+
+    Returns:
+        A single-element list whose dict holds the first returned embedding
+        and the usage info under the "single_embedding" test label.
+    """
+    # test with text input
+    response = client.embeddings.create(
+        model=model,
+        input=text,
+        encoding_format="float",
+    )
+    return [{
+        "test": "single_embedding",
+        "embedding": response.data[0].embedding,
+        "usage": response.usage,
+    }]
+
+
 def compare_two_settings(model: str,
                         arg1: List[str],
                         arg2: List[str],
                         env1: Optional[Dict[str, str]] = None,
                         env2: Optional[Dict[str, str]] = None,
+                         *,
+                         method: Literal["generate", "encode"] = "generate",
                         max_wait_seconds: Optional[float] = None) -> None:
    """
    Launch API server with two different sets of arguments/environments
@@ -180,17 +311,70 @@ def compare_two_settings(model: str,
        env2: The second set of environment variables to pass to the API server.
    """

-    trust_remote_code = "--trust-remote-code"
-    if trust_remote_code in arg1 or trust_remote_code in arg2:
-        tokenizer = AutoTokenizer.from_pretrained(model,
-                                                  trust_remote_code=True)
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(model)
+    compare_all_settings(
+        model,
+        [arg1, arg2],
+        [env1, env2],
+        method=method,
+        max_wait_seconds=max_wait_seconds,
+    )
+
+
+def compare_all_settings(model: str,
+                         all_args: List[List[str]],
+                         all_envs: List[Optional[Dict[str, str]]],
+                         *,
+                         method: Literal["generate", "encode"] = "generate",
+                         max_wait_seconds: Optional[float] = None) -> None:
+    """
+    Launch API server with several different sets of arguments/environments
+    and compare the results of the API calls with the first set of arguments.
+    Args:
+        model: The model to test.
+        all_args: A list of argument lists to pass to the API server.
+        all_envs: A list of environment dictionaries to pass to the API server.
+    """
+
+    trust_remote_code = False
+    for args in all_args:
+        if "--trust-remote-code" in args:
+            trust_remote_code = True
+            break
+
+    tokenizer_mode = "auto"
+    for args in all_args:
+        if "--tokenizer-mode" in args:
+            tokenizer_mode = args[args.index("--tokenizer-mode") + 1]
+            break
+
+    tokenizer = get_tokenizer(
+        model,
+        trust_remote_code=trust_remote_code,
+        tokenizer_mode=tokenizer_mode,
+    )
+
+    can_force_load_format = True
+
+    for args in all_args:
+        if "--load-format" in args:
+            can_force_load_format = False
+            break

    prompt = "Hello, my name is"
-    token_ids = tokenizer(prompt)["input_ids"]
-    results = []
-    for args, env in ((arg1, env1), (arg2, env2)):
+    token_ids = tokenizer(prompt).input_ids
+    ref_results: List = []
+    for i, (args, env) in enumerate(zip(all_args, all_envs)):
+        if can_force_load_format:
+            # we are comparing the results and
+            # usually we don't need real weights.
+            # we force to use dummy weights by default,
+            # and it should work for most of the cases.
+            # if not, we can use VLLM_TEST_FORCE_LOAD_FORMAT
+            # environment variable to force the load format,
+            # e.g. in quantization tests.
+            args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT]
+        compare_results: List = []
+        results = ref_results if i == 0 else compare_results
        with RemoteOpenAIServer(model,
                                args,
                                env_dict=env,
@@ -207,104 +391,27 @@ def compare_two_settings(model: str,
                "root": served_model.root,
            })

-            # test with text prompt
-            completion = client.completions.create(model=model,
-                                                   prompt=prompt,
-                                                   max_tokens=5,
-                                                   temperature=0.0)
-
-            results.append({
-                "test": "single_completion",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test using token IDs
-            completion = client.completions.create(
-                model=model,
-                prompt=token_ids,
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "token_ids",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test seeded random sampling
-            completion = client.completions.create(model=model,
-                                                   prompt=prompt,
-                                                   max_tokens=5,
-                                                   seed=33,
-                                                   temperature=1.0)
-
-            results.append({
-                "test": "seeded_sampling",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test seeded random sampling with multiple prompts
-            completion = client.completions.create(model=model,
-                                                   prompt=[prompt, prompt],
-                                                   max_tokens=5,
-                                                   seed=33,
-                                                   temperature=1.0)
-
-            results.append({
-                "test":
-                "seeded_sampling",
-                "text": [choice.text for choice in completion.choices],
-                "finish_reason":
-                [choice.finish_reason for choice in completion.choices],
-                "usage":
-                completion.usage,
-            })
-
-            # test simple list
-            batch = client.completions.create(
-                model=model,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "simple_list",
-                "text0": batch.choices[0].text,
-                "text1": batch.choices[1].text,
-            })
-
-            # test streaming
-            batch = client.completions.create(
-                model=model,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-                stream=True,
-            )
-            texts = [""] * 2
-            for chunk in batch:
-                assert len(chunk.choices) == 1
-                choice = chunk.choices[0]
-                texts[choice.index] += choice.text
-            results.append({
-                "test": "streaming",
-                "texts": texts,
-            })
-
-    n = len(results) // 2
-    arg1_results = results[:n]
-    arg2_results = results[n:]
-    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, (
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
-            f"{arg1_result=} != {arg2_result=}")
+            if method == "generate":
+                results += _test_completion(client, model, prompt, token_ids)
+            elif method == "encode":
+                results += _test_embeddings(client, model, prompt)
+            else:
+                assert_never(method)
+
+            if i > 0:
+                # if any setting fails, raise an error early
+                ref_args = all_args[0]
+                ref_envs = all_envs[0]
+                compare_args = all_args[i]
+                compare_envs = all_envs[i]
+                for ref_result, compare_result in zip(ref_results,
+                                                      compare_results):
+                    assert ref_result == compare_result, (
+                        f"Results for {model=} are not the same.\n"
+                        f"{ref_args=} {ref_envs=}\n"
+                        f"{compare_args=} {compare_envs=}\n"
+                        f"{ref_result=}\n"
+                        f"{compare_result=}\n")


 def init_test_distributed_environment(
@@ -454,6 +561,37 @@ def fork_new_process_for_each_test(
    return wrapper


+def large_gpu_test(*, min_gb: int):
+    """
+    Decorate a test to be skipped if no GPU is available or it does not have
+    sufficient memory.
+
+    Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
+
+    The device memory is queried once, when the decorator is created. The
+    decorated test is also wrapped with ``fork_new_process_for_each_test``.
+
+    Args:
+        min_gb: Minimum total device memory, in GB, required by the test.
+
+    Returns:
+        A decorator that applies the ``skipif`` mark to the wrapped test.
+    """
+    try:
+        if current_platform.is_cpu():
+            memory_gb = 0
+        else:
+            memory_gb = current_platform.get_device_total_memory() / GB_bytes
+    except Exception as e:
+        # Best effort: when the query fails, warn and evaluate the skip
+        # condition against 0 GB instead of failing at decoration time.
+        warnings.warn(
+            f"An error occurred when finding the available memory: {e}",
+            stacklevel=2,
+        )
+
+        memory_gb = 0
+
+    test_skipif = pytest.mark.skipif(
+        memory_gb < min_gb,
+        # Report the requirement (min_gb), not the amount that was detected.
+        reason=f"Need at least {min_gb}GB GPU memory to run the test.",
+    )
+
+    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
+        return test_skipif(fork_new_process_for_each_test(f))
+
+    return wrapper
+
+
 def multi_gpu_test(*, num_gpus: int):
    """
    Decorate a test to be run only when multiple GPUs are available.
@@ -476,7 +614,8 @@ async def completions_with_server_args(
    server_cli_args: List[str],
    num_logprobs: Optional[int],
    max_wait_seconds: int = 240,
-) -> Completion:
+    max_tokens: Union[int, list] = 5,
+) -> List[Completion]:
    '''Construct a remote OpenAI server, obtain an async client to the
    server & invoke the completions API to obtain completions.

@@ -487,37 +626,49 @@ async def completions_with_server_args(
      num_logprobs: Number of logprobs to report (or `None`)
      max_wait_seconds: timeout interval for bringing up server.
                        Default: 240sec
+      max_tokens: max_tokens value for each of the given input prompts.
+        if only one max_token value is given, the same value is used
+        for all the prompts.

    Returns:
      OpenAI Completion instance
    '''

+    if isinstance(max_tokens, int):
+        max_tokens = [max_tokens] * len(prompts)
+
+    assert len(max_tokens) == len(prompts)
+
    outputs = None
    max_wait_seconds = 240 * 3  # 240 is default
    with RemoteOpenAIServer(model_name,
                            server_cli_args,
                            max_wait_seconds=max_wait_seconds) as server:
        client = server.get_async_client()
-        outputs = await client.completions.create(model=model_name,
-                                                  prompt=prompts,
-                                                  temperature=0,
-                                                  stream=False,
-                                                  max_tokens=5,
-                                                  logprobs=num_logprobs)
+        outputs = [ client.completions.create(model=model_name,
+                                              prompt=[p],
+                                              temperature=0,
+                                              stream=False,
+                                              max_tokens=max_tok,
+                                              logprobs=num_logprobs) \
+                    for p, max_tok in zip(prompts, max_tokens) ]
+        outputs = await asyncio.gather(*outputs)
+
    assert outputs is not None, "Completion API call failed."

    return outputs


-def get_client_text_generations(completions: Completion) -> List[str]:
+def get_client_text_generations(completions: List[Completion]) -> List[str]:
    '''Extract generated tokens from the output of a
    request made to an Open-AI-protocol completions endpoint.
    '''
-    return [x.text for x in completions.choices]
+    # Every Completion must carry exactly one choice; its text is collected.
+    assert all([len(x.choices) == 1 for x in completions])
+    return [x.choices[0].text for x in completions]


 def get_client_text_logprob_generations(
-        completions: Completion) -> List[TextTextLogprobs]:
+        completions: List[Completion]) -> List[TextTextLogprobs]:
    '''Operates on the output of a request made to an Open-AI-protocol
    completions endpoint; obtains top-rank logprobs for each token in
    each :class:`SequenceGroup`
@@ -526,4 +677,4 @@ def get_client_text_logprob_generations(
    text = ''.join(text_generations)
    return [(text_generations, text,
             (None if x.logprobs is None else x.logprobs.top_logprobs))
-            for x in completions.choices]
+            for completion in completions for x in completion.choices]
--- a/tests/weight_loading/models-large.txt
+++ b/tests/weight_loading/models-large.txt
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
-compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
 gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
+awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
\ No newline at end of file
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -20,6 +20,7 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
+compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main

--- a/tests/weight_loading/run_model_weight_loading_test.sh
+++ b/tests/weight_loading/run_model_weight_loading_test.sh
 #!/bin/bash
 SUCCESS=0

-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "weight_loading/models.txt"
+# Print command-line help to stderr; used for unknown options and when
+# no model config file has been provided.
+usage() {
+    echo "Usage: $0 -c <model-config-file>" >&2
+}
+
+# -c <file>: file listing one model configuration per line.
+while getopts "c:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Fail fast when no config file is known: an empty redirect target would
+# otherwise leave MODEL_CONFIGS empty and the loop below would test nothing.
+if [ -z "${CONFIG:-}" ]; then
+    usage
+    exit 1
+fi
+
+# Quote the path so file names containing spaces or glob characters work.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do

--- a/tools/actionlint.sh
+++ b/tools/actionlint.sh
+#!/bin/bash
+# Run actionlint with the arguments given to this script. An actionlint
+# found on PATH is preferred, then an executable ./actionlint in the
+# current directory; otherwise a binary is downloaded first.
+
+# NOTE(review): the two branches below exit 0 no matter what actionlint
+# returned, while the download path ends on the actionlint call itself.
+# Confirm that reporting success unconditionally here is intentional.
+if command -v actionlint &> /dev/null; then
+    actionlint "$@"
+    exit 0
+elif [ -x ./actionlint ]; then
+    ./actionlint "$@"
+    exit 0
+fi
+
+# download a binary to the current directory - v1.7.3
+# The download script is pinned to a specific commit of rhysd/actionlint.
+bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
+./actionlint "$@"
--- a/tools/check_repo.sh
+++ b/tools/check_repo.sh
+#!/bin/bash
+# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time)
+# Exits with status 1 and a message on stderr when either check fails.
+
+# NOTE(review): plain `git diff` compares the working tree with the index, so
+# staged-only changes and untracked files presumably pass this check - confirm
+# that this is the intended definition of "dirty".
+if ! git diff --quiet; then
+	echo "Repo is dirty" >&2
+
+	exit 1
+fi
+
+# `git describe --tags` fails when no tag can be found, e.g. in a shallow clone.
+if ! git describe --tags; then
+	echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2
+
+	exit 1
+fi
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
+#!/bin/bash
+# Run mypy on the default target and on a fixed list of sub-packages.
+# Usage: tools/mypy.sh [CI]
+#   CI  pass 1 to run the argument-less check without "--follow-imports skip"
+#       (default: 0).
+
+CI=${1:-0}
+
+# Type-check a single target; with no target, mypy is run without a path.
+run_mypy() {
+    echo "Running mypy on $1"
+    # Only the target-less call in CI mode follows imports; every other call
+    # skips them. "$CI" is quoted so an argument containing spaces or glob
+    # characters cannot be word-split or expanded inside the test.
+    if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
+        mypy "$@"
+        return
+    fi
+    mypy --follow-imports skip "$@"
+}
+
+run_mypy # Note that this is less strict than CI
+run_mypy tests
+run_mypy vllm/attention
+run_mypy vllm/compilation
+run_mypy vllm/distributed
+run_mypy vllm/engine
+run_mypy vllm/executor
+run_mypy vllm/lora
+run_mypy vllm/model_executor
+run_mypy vllm/plugins
+run_mypy vllm/prompt_adapter
+run_mypy vllm/spec_decode
+run_mypy vllm/worker
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
+import argparse
+import json
+from typing import Dict
+
+from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry
+from vllm.profiler.utils import TablePrinter, indent_string
+
+
+def flatten_entries(entry_cls, profile_dict: Dict):
+    """Flatten a forest of profile nodes into ``(entry, depth)`` pairs.
+
+    Each node is a mapping with an ``"entry"`` mapping (expanded as keyword
+    arguments to ``entry_cls``) and a ``"children"`` iterable of nodes.
+    Nodes are emitted parent first, then their children in order; roots
+    have depth 0.
+    """
+
+    def walk(node, depth):
+        # Pre-order traversal: the node itself, then each subtree.
+        yield entry_cls(**node["entry"]), depth
+        for child in node["children"]:
+            yield from walk(child, depth + 1)
+
+    flattened = []
+    for root in profile_dict:
+        flattened.extend(walk(root, 0))
+    return flattened
+
+
+if __name__ == "__main__":
+    # Command line: trace file, profiling phase and which table to render.
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--json-trace",
+        type=str,
+        required=True,
+        help="json trace file output by "
+        "examples/offline_profile.py",
+    )
+    cli.add_argument(
+        "--phase",
+        type=str,
+        choices=["prefill", "decode_1"],
+        required=True,
+        help="The phase to print the table for.",
+    )
+    cli.add_argument(
+        "--table",
+        type=str,
+        choices=["summary", "model"],
+        default="summary",
+        help="Which table to print, the summary table or the "
+        "layerwise model table",
+    )
+    opts = cli.parse_args()
+
+    with open(opts.json_trace, "r") as trace_file:
+        profile_data = json.load(trace_file)
+
+    # Per table: entry class, key inside the phase data, and column widths.
+    table_specs = {
+        "summary": (
+            SummaryStatsEntry,
+            "summary_stats",
+            dict(name=80,
+                 cuda_time_us=12,
+                 pct_cuda_time=12,
+                 invocations=15),
+        ),
+        "model": (
+            ModelStatsEntry,
+            "model_stats",
+            dict(name=60,
+                 cpu_time_us=12,
+                 cuda_time_us=12,
+                 pct_cuda_time=12,
+                 trace=60),
+        ),
+    }
+    entry_cls, stats_key, column_widths = table_specs[opts.table]
+    entries_and_depths = flatten_entries(entry_cls,
+                                         profile_data[opts.phase][stats_key])
+
+    def depth_prefix(indent):
+        # One dash per nesting level, e.g. "|-- " for depth 2.
+        return "|" + "-" * indent + " "
+
+    # indent entry names based on the depth
+    entries = []
+    for entry, depth in entries_and_depths:
+        entry.name = indent_string(entry.name,
+                                   indent=depth,
+                                   indent_style=depth_prefix)
+        entries.append(entry)
+
+    TablePrinter(type(entries[0]), column_widths).print_table(entries)