Merge remote-tracking branch 'mirror/main'

2216a4e5 · zhuwenwen · ad385667 · 51c24c97 · 2216a4e5 · 2216a4e5
Commit 2216a4e5 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
+from typing import List, Type
+
 import pytest
 import torch.nn.functional as F

-from ....conftest import IMAGE_ASSETS
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
 from ..utils import check_embeddings_close

+HF_TEXT_PROMPTS = [
+    # T -> X
+    "Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501
+    # T -> X
+    "Retrieve an image of this caption: cherry blossom",
+]
+
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    # T + I -> X
    "stop_sign":
    "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign",  # noqa: E501
+    # I -> X
    "cherry_blossom":
-    "<|image_1|> Represent the given image with the following question: What is in the image",  # noqa: E501
+    "<|image_1|> Represent the given image for classification",  # noqa: E501
 })

 MODELS = ["TIGER-Lab/VLM2Vec-Full"]


-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    input_texts: List[str],
+    input_images: PromptImageInput,
    model: str,
+    *,
    dtype: str,
 ) -> None:
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model,
-                     max_model_len=4096,
-                     max_num_seqs=2,
-                     dtype=dtype,
+    with vllm_runner(model, task="embedding", dtype=dtype,
                     enforce_eager=True) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.encode(input_texts, images=input_images)

-    with hf_runner(model, dtype=dtype) as hf_model:
-        all_inputs = hf_model.get_inputs(example_prompts)
+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        all_inputs = hf_model.get_inputs(input_texts, images=input_images)

        all_outputs = []
        for inputs in all_inputs:
@@ -60,3 +72,53 @@ def test_models(
        name_0="hf",
        name_1="vllm",
    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models_text(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    """Run the HF-vs-vLLM embedding comparison on text-only prompts.
+
+    Every prompt in ``HF_TEXT_PROMPTS`` is paired with ``None`` in place of
+    an image, and the resulting lists are handed to ``_run_test``.
+    """
+    # NOTE(review): the ``image_assets`` fixture is requested but never used
+    # in this body.
+    # Build (text, image) pairs and split them again so the shape of the
+    # inputs matches test_models_image; the image slot is always None here.
+    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,  # type: ignore
+        model,
+        dtype=dtype,
+    )
+
+
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    """Run the HF-vs-vLLM embedding comparison on image prompts.
+
+    Each prompt in ``HF_IMAGE_PROMPTS`` is zipped with the matching entry of
+    ``image_assets`` and sent to ``_run_test`` together with that asset's
+    ``pil_image``.
+    """
+    # NOTE(review): large_gpu_test(min_gb=48) presumably gates this test on
+    # available GPU memory -- confirm against its definition.
+    # zip() pairs prompts and assets positionally, so both must be in the
+    # same order.
+    input_texts_images = [
+        (text, asset.pil_image)
+        for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,
+        model,
+        dtype=dtype,
+    )
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -3,10 +3,10 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union

 import torch

-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
+from vllm.platforms import current_platform
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
-from vllm.utils import is_cpu

 TokensText = Tuple[List[int], str]

@@ -19,7 +19,7 @@ def check_outputs_equal(
    name_1: str,
 ):
    """
-    Compare the two sequences generated by different models, 
+    Compare the two sequences generated by different models,
    which should be equal.
    """
    assert len(outputs_0_lst) == len(outputs_1_lst)
@@ -248,13 +248,14 @@ def check_logprobs_close(


 def build_model_context(model_name: str,
+                        task: TaskOption = "auto",
                        tokenizer_name: Optional[str] = None,
                        trust_remote_code: bool = False,
                        dtype: Optional[Union[str, torch.dtype]] = None,
                        mm_processor_kwargs: Optional[Dict] = None,
                        limit_mm_per_prompt: Optional[Dict] = None):
    """Creates an InputContext for a given model.
-    
+
    Args:
        model_name: Name of the model being considered.
        tokenizer_name: Name of the tokenizer being considered.
@@ -269,11 +270,12 @@ def build_model_context(model_name: str,
    if tokenizer_name is None:
        tokenizer_name = model_name
    if dtype is None:
-        dtype = "bfloat16" if is_cpu() else "half"
+        dtype = "bfloat16" if current_platform.is_cpu() else "half"

    model_config = ModelConfig(
        model_name,
-        tokenizer_name,
+        task=task,
+        tokenizer=tokenizer_name,
        tokenizer_mode="auto",
        trust_remote_code=trust_remote_code,
        dtype=dtype,

--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -59,15 +59,7 @@ async def test_evil_forward(tmp_socket):
        await asyncio.sleep(2.0)
        await client.check_health()

-        # Throws an error in first forward pass.
-        with pytest.raises(RAISED_ERROR):
-            async for _ in client.generate(prompt="Hello my name is",
-                                           sampling_params=SamplingParams(),
-                                           request_id=uuid.uuid4()):
-                pass
-        assert client.errored
-
-        # Engine is errored, should get ENGINE_DEAD_ERROR.
+        # The request should fail with MQEngineDeadError.
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
@@ -149,7 +141,7 @@ async def test_failed_abort(tmp_socket):
        client = await engine.make_client()
        assert client.is_running

-        # Firsh check health should work.
+        # First check health should work.
        await client.check_health()

        # Trigger an abort on the client side.
@@ -174,6 +166,45 @@ async def test_failed_abort(tmp_socket):
        client.close()


+@pytest.mark.asyncio
+async def test_batch_error(tmp_socket):
+    """Check that a batch of in-flight requests all fail with MQEngineDeadError.
+
+    Ten generate requests are started as tasks against an engine launched
+    with ``run_with_evil_abort``; an abort is then sent, and every task is
+    expected to finish with an ``MQEngineDeadError`` whose repr mentions
+    ``KeyError``.
+    """
+    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
+                           ipc_path=tmp_socket,
+                           run_fn=run_with_evil_abort) as engine:
+
+        client = await engine.make_client()
+        assert client.is_running
+
+        # First check health should work.
+        await client.check_health()
+
+        # Batch of requests
+        async def do_generate(client):
+            # min_tokens=2048 keeps the engine busy long enough
+            # for it to process the request that will crash
+            # the engine.
+            params = SamplingParams(min_tokens=2048, max_tokens=2048)
+            async for _ in client.generate(prompt="Hello my name is",
+                                           sampling_params=params,
+                                           request_id=uuid.uuid4()):
+                pass
+
+        tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)]
+
+        # This request forces the batch being processed to raise
+        # an exception, after which the engine is in an errored state.
+        await client.abort(request_id="foo")
+
+        # Every request in the failed batch should surface the same
+        # exception, wrapped as a MQEngineDeadError.
+        # return_exceptions=True collects the errors instead of raising the
+        # first one, so each task can be checked individually.
+        errors = await asyncio.gather(*tasks, return_exceptions=True)
+        for e in errors:
+            assert isinstance(e, MQEngineDeadError)
+            assert "KeyError" in repr(e)
+
+        client.close()
+
+
 @pytest.mark.asyncio
 async def test_bad_request(tmp_socket):
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,

--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):

    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,

    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):

    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):

    model_config = ModelConfig(
        model=MODEL_NAME,
+        task="auto",
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,

--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -221,6 +221,7 @@ def test_max_tokens_kwarg_overrides(num_crops):
    expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops

    ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
                              limit_mm_per_prompt={"image": 1})
@@ -256,6 +257,7 @@ def test_max_tokens_kwarg_overrides(num_crops):
 def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
    """Ensure that max token calcs filters out invalid mm_processor_kwargs"""
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
                              limit_mm_per_prompt={"image": 1})
@@ -278,12 +280,13 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):

 ### Test overrides for the mapper
 @pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE])
-def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
+def test_default_mapper_with_processor_kwargs(image_assets, num_crops):
    """Ensure that the mapper processor kwargs can fall back to HF models."""
    # NOTE - we don't validate bad inputs for the default mapper, because it's
    # through the automodel interface in transformers, so we can't easily
    # inspect what kwargs are or are not allowed.
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs={"num_crops": num_crops},
                              limit_mm_per_prompt={"image": 1})
@@ -311,6 +314,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
        init_num_crops, inference_num_crops)

    ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=init_kwargs,
                              limit_mm_per_prompt={"image": 1})
@@ -348,6 +352,7 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
    """Ensure that custom mappers filters out invalid mm_processor_kwargs"""
    # Should filter out the init time kwargs
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
                              limit_mm_per_prompt={"image": 1})

--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
@@ -4,8 +4,8 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
 import pytest

-from tests.conftest import cleanup
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory

 MODEL_LEN_LEN = [
    # Example models with sliding window.
@@ -31,7 +31,7 @@ def test_disable_sliding_window(model_len_len, ):
        model_config.max_model_len)

    del vllm_disabled_model
-    cleanup()
+    cleanup_dist_env_and_memory()

    vllm_enabled_model = LLM(model, disable_sliding_window=False)
    vllm_enabled_model.generate("Hi my name is")
@@ -41,4 +41,4 @@ def test_disable_sliding_window(model_len_len, ):
        model_config.max_model_len)

    del vllm_enabled_model
-    cleanup()
+    cleanup_dist_env_and_memory()
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -107,8 +107,7 @@ def validate_generated_texts(hf_runner,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=False,
-                     gpu_memory_utilization=0.8) as llm:
+                     enforce_eager=False) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")


--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -57,7 +57,8 @@ def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:

    try:
        model_config = ModelConfig(model_path,
-                                   model_path,
+                                   task="auto",
+                                   tokenizer=model_path,
                                   tokenizer_mode="auto",
                                   trust_remote_code=False,
                                   seed=0,

--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -4,10 +4,10 @@ from typing import List, Optional, Sequence, Tuple, Union
 import pytest

 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import PromptLogprobs, SampleLogprobs

-from ...conftest import cleanup
 from ...models.utils import (TokensTextLogprobs,
                             TokensTextLogprobsPromptLogprobs,
                             check_logprobs_close, check_outputs_equal)
@@ -44,7 +44,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
        yield llm

        del llm
-        cleanup()
+        cleanup_dist_env_and_memory()

    return generate


--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
-import contextlib
 import functools
 import gc
 from typing import Callable, TypeVar

 import pytest
-import ray
 import torch
 from typing_extensions import ParamSpec

-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel)
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig


 @pytest.fixture(autouse=True)
 def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    ray.shutdown()
-    gc.collect()
-    torch.cuda.empty_cache()
+    """Autouse fixture that calls ``cleanup_dist_env_and_memory``.
+
+    The fixture has no ``yield``, so the call happens during fixture setup.
+    ``shutdown_ray=True`` is passed explicitly.
+    """
+    cleanup_dist_env_and_memory(shutdown_ray=True)


 _P = ParamSpec("_P")

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -2,6 +2,42 @@ import pytest

 from vllm.config import ModelConfig

+
+@pytest.mark.parametrize(("model_id", "expected_task"), [
+    ("facebook/opt-125m", "generate"),
+    ("intfloat/e5-mistral-7b-instruct", "embedding"),
+])
+def test_auto_task(model_id, expected_task):
+    """Check that ``task="auto"`` resolves to the expected task per model.
+
+    A ``ModelConfig`` is built with ``task="auto"`` and its ``task``
+    attribute is compared with ``expected_task``.
+    """
+    config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+    )
+
+    assert config.task == expected_task
+
+
+@pytest.mark.parametrize(("model_id", "bad_task"), [
+    ("facebook/opt-125m", "embedding"),
+    ("intfloat/e5-mistral-7b-instruct", "generate"),
+])
+def test_incorrect_task(model_id, bad_task):
+    """Check that an unsupported task is rejected at construction time.
+
+    Building a ``ModelConfig`` with ``bad_task`` must raise a ``ValueError``
+    whose message matches ``does not support the .* task``.
+    """
+    with pytest.raises(ValueError, match=r"does not support the .* task"):
+        ModelConfig(
+            model_id,
+            task=bad_task,
+            tokenizer=model_id,
+            tokenizer_mode="auto",
+            trust_remote_code=False,
+            seed=0,
+            dtype="float16",
+        )
+
+
 MODEL_IDS_EXPECTED = [
    ("Qwen/Qwen1.5-7B", 32768),
    ("mistralai/Mistral-7B-v0.1", 4096),
@@ -14,7 +50,8 @@ def test_disable_sliding_window(model_id_expected):
    model_id, expected = model_id_expected
    model_config = ModelConfig(
        model_id,
-        model_id,
+        task="auto",
+        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
@@ -32,7 +69,8 @@ def test_get_sliding_window():
    # when use_sliding_window is False.
    qwen2_model_config = ModelConfig(
        "Qwen/Qwen1.5-7B",
-        "Qwen/Qwen1.5-7B",
+        task="auto",
+        tokenizer="Qwen/Qwen1.5-7B",
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
@@ -49,7 +87,8 @@ def test_get_sliding_window():

    mistral_model_config = ModelConfig(
        "mistralai/Mistral-7B-v0.1",
-        "mistralai/Mistral-7B-v0.1",
+        task="auto",
+        tokenizer="mistralai/Mistral-7B-v0.1",
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
@@ -70,7 +109,8 @@ def test_rope_customization():

    llama_model_config = ModelConfig(
        "meta-llama/Meta-Llama-3-8B-Instruct",
-        "meta-llama/Meta-Llama-3-8B-Instruct",
+        task="auto",
+        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="float16",
@@ -82,7 +122,8 @@ def test_rope_customization():

    llama_model_config = ModelConfig(
        "meta-llama/Meta-Llama-3-8B-Instruct",
-        "meta-llama/Meta-Llama-3-8B-Instruct",
+        task="auto",
+        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="float16",
@@ -98,7 +139,8 @@ def test_rope_customization():

    longchat_model_config = ModelConfig(
        "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
+        task="auto",
+        tokenizer="lmsys/longchat-13b-16k",
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="float16",
@@ -112,7 +154,8 @@ def test_rope_customization():

    longchat_model_config = ModelConfig(
        "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
+        task="auto",
+        tokenizer="lmsys/longchat-13b-16k",
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="float16",

--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -32,5 +32,5 @@ def test_scalar_type_min_max(type_tuple):
            max = torch.iinfo(torch_type).max

    print(t, min, max, t.min(), t.max())
-    assert min == t.min()
-    assert max == t.max()
+    assert min == t.min(), f"min: {min} != {t.min()}"
+    assert max == t.max(), f"max: {max} != {t.max()}"
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -46,9 +46,10 @@ def test_filter_subtensors():
 @pytest.fixture(scope="module")
 def llama_2_7b_files():
    with TemporaryDirectory() as cache_dir:
-        input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
+        input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
                                      cache_dir=cache_dir,
-                                      ignore_patterns="*.bin*")
+                                      ignore_patterns=["*.bin*", "original/*"])
+
        yield input_dir


@@ -58,9 +59,12 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
    # Dump worker states to output directory
    llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
        path=output_dir)
+
    # Copy metadata files to output directory
    for file in os.listdir(input_dir):
-        if not any(file.endswith(ext) for ext in weights_patterns):
+        if not any(
+                file.endswith(ext) and not os.path.isdir(file)
+                for ext in weights_patterns):
            shutil.copy(f"{input_dir}/{file}", output_dir)



--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -59,7 +59,7 @@ def test_deprecate_kwargs_always():
    with pytest.warns(DeprecationWarning, match="'old_arg'"):
        dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
        dummy(new_arg=1)


@@ -69,10 +69,10 @@ def test_deprecate_kwargs_never():
    def dummy(*, old_arg: object = None, new_arg: object = None):
        pass

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
        dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
        dummy(new_arg=1)


@@ -86,15 +86,15 @@ def test_deprecate_kwargs_dynamic():
    with pytest.warns(DeprecationWarning, match="'old_arg'"):
        dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
        dummy(new_arg=1)

    is_deprecated = False

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
        dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
        dummy(new_arg=1)



--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
+import json
+from typing import Generator, List, Optional
+
+import partial_json_parser
+import pytest
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall,
+                                              ToolCall)
+from vllm.entrypoints.openai.tool_parsers import JambaToolParser
+from vllm.transformers_utils.detokenizer import detokenize_incrementally
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+
+MODEL = "ai21labs/Jamba-tiny-dev"
+
+
+@pytest.fixture(scope="module")
+def jamba_tokenizer():
+    """Return the tokenizer for ``MODEL``; module scope means one per module."""
+    return get_tokenizer(tokenizer_name=MODEL)
+
+
+@pytest.fixture
+def jamba_tool_parser(jamba_tokenizer):
+    """Return a ``JambaToolParser`` built from the ``jamba_tokenizer`` fixture."""
+    return JambaToolParser(jamba_tokenizer)
+
+
+def assert_tool_calls(actual_tool_calls: List[ToolCall],
+                      expected_tool_calls: List[ToolCall]):
+    """Assert that two lists of tool calls match pairwise.
+
+    The lists must have the same length. For each pair, the actual call must
+    have a string id longer than 16 characters, type ``"function"``, and a
+    ``function`` equal to the expected one. The ids themselves are not
+    compared against the expected calls.
+    """
+    assert len(actual_tool_calls) == len(expected_tool_calls)
+
+    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
+                                                    expected_tool_calls):
+        # Only the shape of the id is checked, not its value.
+        assert isinstance(actual_tool_call.id, str)
+        assert len(actual_tool_call.id) > 16
+
+        assert actual_tool_call.type == "function"
+        assert actual_tool_call.function == expected_tool_call.function
+
+
+def stream_delta_message_generator(
+        jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer,
+        model_output: str) -> Generator[DeltaMessage, None, None]:
+    """Replay ``model_output`` through the streaming parser one token at a time.
+
+    The text is encoded without special tokens. Each token is then
+    detokenized incrementally and passed to
+    ``extract_tool_calls_streaming`` together with the text and token ids
+    seen so far. Only truthy results returned by the parser are yielded.
+    """
+    all_token_ids = jamba_tokenizer.encode(model_output,
+                                           add_special_tokens=False)
+
+    # Detokenizer state carried from one iteration to the next.
+    previous_text = ""
+    previous_tokens = None
+    prefix_offset = 0
+    read_offset = 0
+    for i, delta_token in enumerate(all_token_ids):
+        # Token-id views for this step: the new token, everything before it,
+        # and everything up to and including it.
+        delta_token_ids = [delta_token]
+        previous_token_ids = all_token_ids[:i]
+        current_token_ids = all_token_ids[:i + 1]
+
+        (new_tokens, delta_text, new_prefix_offset,
+         new_read_offset) = detokenize_incrementally(
+             tokenizer=jamba_tokenizer,
+             all_input_ids=current_token_ids,
+             prev_tokens=previous_tokens,
+             prefix_offset=prefix_offset,
+             read_offset=read_offset,
+             skip_special_tokens=False,
+             spaces_between_special_tokens=True,
+         )
+
+        current_text = previous_text + delta_text
+
+        delta_message = jamba_tool_parser.extract_tool_calls_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+            request=None,  # type: ignore[arg-type]
+        )
+        # Falsy results (e.g. None) are dropped rather than yielded.
+        if delta_message:
+            yield delta_message
+
+        # Advance the state; previous_tokens starts as None, so the first
+        # iteration takes new_tokens as-is instead of concatenating.
+        previous_text = current_text
+        previous_tokens = previous_tokens + new_tokens if previous_tokens\
+            else new_tokens
+        prefix_offset = new_prefix_offset
+        read_offset = new_read_offset
+
+
+def test_extract_tool_calls_no_tools(jamba_tool_parser):
+    """Check non-streaming extraction on plain text with no tool calls.
+
+    The result must report no tools called, an empty ``tool_calls`` list,
+    and ``content`` equal to the input text.
+    """
+    model_output = "This is a test"
+    extracted_tool_calls = jamba_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+    assert not extracted_tool_calls.tools_called
+    assert extracted_tool_calls.tool_calls == []
+    assert extracted_tool_calls.content == model_output
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool",
+        "single_tool_with_content",
+        "parallel_tools",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Dallas",
+                                                       "state": "TX",
+                                                       "unit": "fahrenheit"
+                                                   })))
+            ],
+            None),
+        (
+            ''' Sure! let me call the tool for you.<tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Dallas",
+                                                       "state": "TX",
+                                                       "unit": "fahrenheit"
+                                                   })))
+            ],
+            " Sure! let me call the tool for you."),
+        (
+            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n    {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Dallas",
+                                                       "state": "TX",
+                                                       "unit": "fahrenheit"
+                                                   }))),
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Orlando",
+                                                       "state": "FL",
+                                                       "unit": "fahrenheit"
+                                                   })))
+            ],
+            None)
+    ],
+)
+def test_extract_tool_calls(jamba_tool_parser, model_output,
+                            expected_tool_calls, expected_content):
+    """Check non-streaming extraction for outputs that contain tool calls.
+
+    For each parametrized case the parser must report that tools were
+    called, return tool calls matching ``expected_tool_calls`` (compared via
+    ``assert_tool_calls``), and return ``content`` equal to
+    ``expected_content``.
+    """
+    extracted_tool_calls = jamba_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+    assert extracted_tool_calls.tools_called
+
+    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
+
+    assert extracted_tool_calls.content == expected_content
+
+
+@pytest.mark.parametrize(
+    ids=[
+        "no_tools",
+        "single_tool",
+        "single_tool_with_content",
+        "parallel_tools",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        ('''This is a test''', [], '''This is a test'''),
+        (
+            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Dallas",
+                                                       "state": "TX",
+                                                       "unit": "fahrenheit"
+                                                   })))
+            ],
+            " "),
+        (
+            ''' Sure! let me call the tool for you.<tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Dallas",
+                                                       "state": "TX",
+                                                       "unit": "fahrenheit"
+                                                   })))
+            ],
+            " Sure! let me call the tool for you."),
+        (
+            ''' <tool_calls>[\n    {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n    {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]</tool_calls>''',  # noqa: E501
+            [
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Dallas",
+                                                       "state": "TX",
+                                                       "unit": "fahrenheit"
+                                                   }))),
+                ToolCall(function=FunctionCall(name="get_current_weather",
+                                               arguments=json.dumps(
+                                                   {
+                                                       "city": "Orlando",
+                                                       "state": "FL",
+                                                       "unit": "fahrenheit"
+                                                   })))
+            ],
+            " ")
+    ],
+)
+def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer,
+                                      model_output, expected_tool_calls,
+                                      expected_content):
+    """Check streaming extraction by reassembling the streamed deltas.
+
+    The deltas produced by ``stream_delta_message_generator`` are folded
+    back into plain content, function names, argument strings and tool-call
+    ids. The accumulated content must equal ``expected_content`` and the
+    rebuilt tool calls must match ``expected_tool_calls``.
+    """
+    # Accumulators for everything streamed so far.
+    other_content: str = ''
+    function_names: List[str] = []
+    function_args_strs: List[str] = []
+    tool_call_idx: int = -1
+    tool_call_ids: List[Optional[str]] = []
+
+    for delta_message in stream_delta_message_generator(
+            jamba_tool_parser, jamba_tokenizer, model_output):
+        # role should never be streamed from tool parser
+        assert not delta_message.role
+
+        if delta_message.content:
+            other_content += delta_message.content
+
+        streamed_tool_calls = delta_message.tool_calls
+
+        if streamed_tool_calls and len(streamed_tool_calls) > 0:
+            # make sure only one diff is present - correct even for parallel
+            assert len(streamed_tool_calls) == 1
+            tool_call = streamed_tool_calls[0]
+
+            # if a new tool is being called, set up empty arguments
+            if tool_call.index != tool_call_idx:
+                tool_call_idx = tool_call.index
+                function_args_strs.append("")
+                tool_call_ids.append(None)
+
+            # if a tool call ID is streamed, make sure one hasn't been already
+            if tool_call.id and not tool_call_ids[tool_call.index]:
+                tool_call_ids[tool_call.index] = tool_call.id
+
+            # if parts of the function start being streamed
+            if tool_call.function:
+                # if the function name is defined, set it. it should be streamed
+                # IN ENTIRETY, exactly one time.
+                if tool_call.function.name:
+                    assert isinstance(tool_call.function.name, str)
+                    function_names.append(tool_call.function.name)
+
+                if tool_call.function.arguments:
+                    # make sure they're a string and then add them to the list
+                    assert isinstance(tool_call.function.arguments, str)
+
+                    function_args_strs[
+                        tool_call.index] += tool_call.function.arguments
+
+    assert other_content == expected_content
+
+    # Rebuild complete ToolCall objects from the accumulated pieces; the
+    # argument strings go through partial_json_parser.ensure_json first.
+    actual_tool_calls = [
+        ToolCall(id=tool_call_id,
+                 function=FunctionCall(
+                     name=function_name,
+                     arguments=partial_json_parser.ensure_json(
+                         function_args_str, Allow.OBJ | Allow.STR)))
+        for tool_call_id, function_name, function_args_str in zip(
+            tool_call_ids, function_names, function_args_strs)
+    ]
+    assert_tool_calls(actual_tool_calls, expected_tool_calls)
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -87,8 +87,19 @@ def test_traces(trace_service):
            f"The fake trace service didn't receive a trace within "
            f"the {timeout} seconds timeout")

-    attributes = decode_attributes(trace_service.request.resource_spans[0].
-                                   scope_spans[0].spans[0].attributes)
+    request = trace_service.request
+    assert len(request.resource_spans) == 1, (
+        f"Expected 1 resource span, "
+        f"but got {len(request.resource_spans)}")
+    assert len(request.resource_spans[0].scope_spans) == 1, (
+        f"Expected 1 scope span, "
+        f"but got {len(request.resource_spans[0].scope_spans)}")
+    assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+        f"Expected 1 span, "
+        f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+    attributes = decode_attributes(
+        request.resource_spans[0].scope_spans[0].spans[0].attributes)
    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
    assert attributes.get(
        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
@@ -142,8 +153,19 @@ def test_traces_with_detailed_steps(trace_service):
            f"The fake trace service didn't receive a trace within "
            f"the {timeout} seconds timeout")

-    attributes = decode_attributes(trace_service.request.resource_spans[0].
-                                   scope_spans[0].spans[0].attributes)
+    request = trace_service.request
+    assert len(request.resource_spans) == 1, (
+        f"Expected 1 resource span, "
+        f"but got {len(request.resource_spans)}")
+    assert len(request.resource_spans[0].scope_spans) == 1, (
+        f"Expected 1 scope span, "
+        f"but got {len(request.resource_spans[0].scope_spans)}")
+    assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+        f"Expected 1 span, "
+        f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+    attributes = decode_attributes(
+        request.resource_spans[0].scope_spans[0].spans[0].attributes)
    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
    assert attributes.get(
        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -8,7 +8,7 @@ import time
 import warnings
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union

 import openai
 import pytest
@@ -454,13 +454,13 @@ def multi_process_parallel(


 @contextmanager
-def error_on_warning():
+def error_on_warning(category: Type[Warning] = Warning):
    """
    Within the scope of this context manager, tests will fail if any warning
-    is emitted.
+    of the given category is emitted.
    """
    with warnings.catch_warnings():
-        warnings.simplefilter("error")
+        warnings.filterwarnings("error", category=category)

        yield

@@ -587,7 +587,7 @@ def large_gpu_test(*, min_gb: int):
    )

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
-        return test_skipif(fork_new_process_for_each_test(f))
+        return test_skipif(f)

    return wrapper


--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -5,8 +5,9 @@ import pytest
 import torch

 from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.utils import is_cpu, make_tensor_with_pad
+from vllm.utils import make_tensor_with_pad
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import _get_graph_batch_size

@@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args,
    return model_runner


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@@ -74,7 +75,7 @@ def test_empty_seq_group():
    assert return_seq_lens is None


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size):
    assert torch.equal(actual, expected)


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="CPU backend is currently "
                    "unsupported for encoder/ "
                    "decoder models")
@@ -490,7 +491,7 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
 def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
    """
    Tests that for encoder-decoder models with CUDA Graph capture and replay
-    enabled, the tensors used during the decode phase are correctly padded 
+    enabled, the tensors used during the decode phase are correctly padded
    for varying input batch sizes.
    """
    model_runner = _create_model_runner(

--- a/tests/worker/test_profile.py
+++ b/tests/worker/test_profile.py
+import torch
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.worker import Worker
+
+
+def test_gpu_memory_profiling():
+    # Checks the GPU memory profiling that Worker runs to determine how many
+    # KV cache blocks can be allocated on the GPU.
+    # torch.cuda.mem_get_info is patched to report a fixed 10 GiB total, so
+    # the expected block count does not depend on the GPU the test runs on.
+
+    # Engine config used to build the worker; block counts are preset to 1000.
+    engine_args = EngineArgs(model="facebook/opt-125m",
+                             dtype="half",
+                             load_format="dummy")
+    engine_config = engine_args.create_engine_config()
+    engine_config.cache_config.num_gpu_blocks = 1000
+    engine_config.cache_config.num_cpu_blocks = 1000
+
+    # Create the driver worker (rank 0) with a local ip/port init method.
+    distributed_init_method = get_distributed_init_method(
+        get_ip(), get_open_port())
+    worker = Worker(
+        model_config=engine_config.model_config,
+        parallel_config=engine_config.parallel_config,
+        scheduler_config=engine_config.scheduler_config,
+        device_config=engine_config.device_config,
+        cache_config=engine_config.cache_config,
+        load_config=engine_config.load_config,
+        local_rank=0,
+        rank=0,
+        distributed_init_method=distributed_init_method,
+        is_driver_worker=True,
+    )
+
+    # Initialize the device and load the model so that it can be profiled.
+    worker.init_device()
+    worker.load_model()
+
+    # Fake mem_get_info: 10 GiB total, free = total - torch-allocated bytes.
+    def mock_mem_info():
+        current_usage = torch.cuda.memory_stats(
+        )["allocated_bytes.all.current"]
+        mock_total_bytes = 10 * 1024**3
+        free = mock_total_bytes - current_usage
+
+        return (free, mock_total_bytes)
+
+    from unittest.mock import patch
+    with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info):
+        gpu_blocks, _ = worker.determine_num_available_blocks()
+
+    # Expected budget, given the 10 GiB total reported by mock_mem_info:
+    #   peak torch usage ~0.7077 GiB, nothing allocated outside of torch,
+    #   utilization target 9.0 GiB (presumably 0.9 of total - TODO confirm),
+    #   leaving 9.0 - 0.7077 = 8.2923 GiB for the KV cache.
+    block_size = CacheEngine.get_cache_block_size(
+        engine_config.cache_config, engine_config.model_config,
+        engine_config.parallel_config)
+
+    expected_blocks = (8.2923 * 1024**3) // block_size  # GiB -> whole blocks
+
+    # Compare within a small tolerance (strictly fewer than 10 blocks off)
+    # for portability: hardware, kernel, or dependency changes could all
+    # affect memory utilization.
+    # A 10 block tolerance here should be about 6MB of wiggle room.
+    assert abs(gpu_blocks - expected_blocks) < 10
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -2,6 +2,10 @@

 CI=${1:-0}

+if [ $CI -eq 1 ]; then
+    set -e
+fi
+
 run_mypy() {
    echo "Running mypy on $1"
    if [ $CI -eq 1 ] && [ -z "$1" ]; then