Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 33650733 · 33650733
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/neuron/1_core/test_neuron_quant.py
+++ b/tests/neuron/1_core/test_neuron_quant.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.model_executor.layers.quantization.neuron_quant import (
-    NeuronQuantConfig)
-def test_get_supported_act_dtypes():
-    neuron_quant_config = NeuronQuantConfig()
-    supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
-    target_list = ["any_dtype1", "any_dtype2"]
-    for dtype in target_list:
-        assert dtype in supported_act_dtypes
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
-import pytest
-import torch
-import torch.nn.functional as F
-from vllm.utils import cdiv
-class BlockDiagonalCausalFromBottomRightMask:
-    @staticmethod
-    def _from_seqlens(query_lens, seq_lens, block_size=None):
-        from torch import logical_and, logical_or
-        contexted = block_size is None
-        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-        n_queries = sum(query_lens)
-        num_seqs = len(query_lens)
-        if contexted:
-            key_lens_blockaligned = seq_lens
-        else:
-            n_blocks_per_seq = (context_lens + block_size - 1) // block_size
-            offset_per_seq = n_blocks_per_seq * block_size
-            key_lens_blockaligned = offset_per_seq[:num_seqs].tolist()
-        n_keys = sum(key_lens_blockaligned)
-        a = (torch.arange(n_queries).reshape(n_queries,
-                                             1).expand(n_queries, n_keys))
-        b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys)
-        q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0)
-        k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0)
-        prior_mask = torch.zeros(n_queries, n_keys)
-        new_masks: list[torch.Tensor] = []
-        for seq_id in range(num_seqs):
-            ri = q_cumsum[seq_id]
-            ci = k_cumsum[seq_id]
-            nr = query_lens[seq_id]
-            if contexted:
-                nc = seq_lens[seq_id]
-                a_offset = ci + nc - ri - nr
-                new_mask = (a + a_offset) >= b
-            else:
-                nc = context_lens[seq_id]
-                a_offset = ci + nc - 1
-                new_mask = a_offset >= b
-            left_mask = b >= ci
-            top_mask = a >= ri
-            bottom_mask = a < (ri + nr)
-            new_mask = logical_and(
-                logical_and(logical_and(new_mask, left_mask), top_mask),
-                bottom_mask,
-            )
-            prior_mask = logical_or(prior_mask, new_mask)
-            new_masks = new_masks + [new_mask]
-        return prior_mask
-    @staticmethod
-    def from_seqlens(query_lens, seq_lens, block_size=None):
-        contexted = block_size is None
-        if contexted:
-            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
-                query_lens, seq_lens)
-            active_mask = None
-        else:
-            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
-                query_lens, seq_lens, block_size)
-            active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
-                query_lens, query_lens)
-        return prior_mask, active_mask
-def ref_softmax(x: torch.Tensor,
-                dim: int,
-                mixed_precision=False,
-                return_max_reduce=False):
-    max_value = torch.amax(x, dim=dim, keepdims=True)
-    exp = torch.exp(x - max_value)
-    if mixed_precision:
-        sum_value = torch.sum(exp.astype(torch.float32),
-                              dim=dim,
-                              keepdims=True).astype(x.dtype)
-    else:
-        sum_value = torch.sum(exp, dim=dim, keepdims=True)
-    if return_max_reduce:
-        return exp / sum_value, max_value, torch.reciprocal(sum_value)
-    return exp / sum_value
-def ref_masked_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    scale: float,
-    attn_mask: Optional[torch.Tensor] = None,
-    return_max_reduce: Optional[bool] = False,
-) -> torch.Tensor:
-    scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float()
-    if attn_mask is not None:
-        masked_score = scaled_qk + attn_mask.float()
-    if return_max_reduce:
-        norm_score, cached_max, cached_sum_reciprocal = ref_softmax(
-            masked_score, dim=-1, return_max_reduce=True)
-    else:
-        norm_score = ref_softmax(masked_score, dim=-1)
-    out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value)
-    if return_max_reduce:
-        return (
-            out,
-            cached_max,
-            cached_sum_reciprocal,
-            norm_score,
-            masked_score,
-            scaled_qk,
-        )
-    else:
-        return (out, )
-def ref_context_attention(
-    query,
-    key,
-    value,
-    query_lens,
-    seq_lens,
-    head_size,
-    num_queries_per_kv,
-    return_max_reduce=False,
-):
-    scale = float(1.0 / (head_size**0.5))
-    if num_queries_per_kv > 1:
-        # Handle MQA and GQA
-        key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
-        value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
-    attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-        query_lens, seq_lens)
-    # convert binary mask to -inf values
-    attn_mask = torch.logical_not(attn_mask)
-    attn_mask = attn_mask.float() * -30000
-    output, *debug_tensors = ref_masked_attention(
-        query,
-        key,
-        value,
-        scale,
-        attn_mask,
-        return_max_reduce=return_max_reduce,
-    )
-    output = output.unsqueeze(1)
-    if return_max_reduce:
-        cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
-            debug_tensors)
-        return (
-            output,
-            cached_max,
-            cached_sum_reciprocal,
-            lse,
-            masked_score,
-            scaled_qk,
-        )
-    else:
-        return output
-def sample_inputs(
-    prefill_batch_size,
-    decode_batch_size,
-    min_query_len,
-    max_query_len,
-    min_ctx_len,
-    max_ctx_len,
-    block_size,
-    num_heads,
-    num_kv_heads,
-    head_size,
-    dtype,
-):
-    batch_size = prefill_batch_size + decode_batch_size
-    max_model_len = (max_query_len + max_ctx_len) * 4
-    max_block_per_request = max_model_len // block_size
-    cache_size = (batch_size * max_block_per_request) + 2
-    prefill_ctx_lens = torch.randint(min_ctx_len,
-                                     max_ctx_len + 1, (prefill_batch_size, ),
-                                     dtype=torch.long).tolist()
-    decode_ctx_lens = torch.randint(min_ctx_len,
-                                    max_ctx_len + 1, (decode_batch_size, ),
-                                    dtype=torch.long).tolist()
-    ctx_lens = prefill_ctx_lens + decode_ctx_lens
-    query_lens = torch.randint(
-        min_query_len,
-        max_query_len + 1,
-        (prefill_batch_size, ),
-        dtype=torch.long,
-    ).tolist() + [1 for _ in range(decode_batch_size)]
-    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
-    num_tokens = sum(query_lens)
-    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
-    query.uniform_(-1, 1)
-    torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
-    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
-    kv.uniform_(-1, 1)
-    key, value = kv.unbind(dim=1)
-    k_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=dtype)
-    v_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=dtype)
-    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
-    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
-    values = torch.arange(0, cache_size, dtype=torch.long)
-    values = values[torch.randperm(cache_size)]
-    block_table = values[:batch_size * max_block_per_request].view(
-        batch_size, max_block_per_request)
-    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
-                                            dtype=torch.long),
-                               dim=0)
-    # copy kv to cache
-    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-                                                dtype=torch.long),
-                                   dim=0)
-    for i in range(batch_size):
-        for j in range(query_lens[i]):
-            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
-                                            j])
-            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
-                                              b_ctx_len[i] + j])
-        cur_ctx = 0
-        block_id = 0
-        while cur_ctx < b_ctx_len[i]:
-            start_loc = b_seq_start_loc[i] + cur_ctx
-            if cur_ctx + block_size > b_ctx_len[i]:
-                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
-            else:
-                end_loc = start_loc + block_size
-            start_slot = block_table[i, block_id] * block_size
-            end_slot = start_slot + end_loc - start_loc
-            k_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             key[start_loc:end_loc])
-            v_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             value[start_loc:end_loc])
-            cur_ctx += block_size
-            block_id += 1
-    kv_cache = torch.stack([k_cache, v_cache])
-    return (
-        query,
-        k,
-        v,
-        kv_cache,
-        block_table,
-        key,
-        value,
-        query_lens,
-        seq_lens,
-    )
-def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
-                            num_blocks):
-    context_lens = seq_lens - query_lens
-    blocks_per_seq = (context_lens + block_size - 1) // block_size
-    num_seqs = len(seq_lens)
-    active_blocks: list[int] = []
-    for seq_id in range(num_seqs):
-        active_blocks = (
-            active_blocks +
-            block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
-    return F.pad(
-        torch.tensor(active_blocks, dtype=torch.int32),
-        (0, num_blocks - len(active_blocks)),
-        "constant",
-        0,
-    )
-@pytest.mark.parametrize(
-    "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision",
-    [
-        # Test minimal configurations (small block size)
-        (1, 199, 1, 512, 4, 2, 8, False
-         ),  # minimal block size, small dimensions
-        (1, 199, 1, 512, 4, 2, 8, True),  # same with mixed precision
-        # Test common/medium configurations
-        (4, 12, 32, 2048, 32, 8, 64, False),  # common case, larger heads
-        (4, 12, 32, 2048, 16, 4, 32,
-         True),  # medium size, mixed precision, grouped-query attention (GQA)
-        # Test large configurations
-        (4, 12, 256, 8192, 8, 1, 128, False),  # large blocks, large head size
-        (4, 12, 256, 8192, 64, 8, 64, True),  # large blocks, many heads
-        # Test asymmetric configurations
-        (2, 24, 64, 4096, 12, 4, 96, False),  # varied batch sizes
-        (8, 8, 128, 2048, 24, 2, 48, True),  # balanced batches
-        # Test edge cases
-        (1, 128, 16, 1024, 4, 2, 16, False),  # large decode batch
-        (16, 4, 8, 1024, 4, 2, 128, True),  # large prefill batch
-        (4, 12, 32, 2048, 16, 1, 32, True),  # multi-head attention (MHA)
-        (4, 12, 32, 2048, 16, 16, 32, True),  # multi-query attention (MQA)
-    ])
-@torch.inference_mode()
-def test_contexted_kv_attention(
-    monkeypatch: pytest.MonkeyPatch,
-    prefill_batch_size: int,
-    decode_batch_size: int,
-    num_heads: int,
-    num_queries_per_kv: int,
-    head_size: int,
-    block_size: int,
-    large_tile_size,
-    mixed_precision: bool,
-) -> None:
-    import torch_xla.core.xla_model as xm
-    from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc,
-                                                   reorder_context_mask)
-    assert large_tile_size % block_size == 0
-    device = xm.xla_device()
-    compiler_flags_str = " ".join([
-        "-O1",
-        "--retry_failed_compilation",
-    ])
-    with monkeypatch.context() as m:
-        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
-        torch.manual_seed(0)
-        torch.set_printoptions(sci_mode=False)
-        torch.set_default_device("cpu")
-        dtype = torch.float32
-        min_ctx_len = 32
-        max_ctx_len = 1024
-        min_query_len = 16
-        max_query_len = 512
-        num_kv_heads = num_heads // num_queries_per_kv
-        (
-            query,
-            k_active,
-            v_active,
-            kv_cache,
-            block_table,
-            key,
-            value,
-            query_lens,
-            seq_lens,
-        ) = sample_inputs(
-            prefill_batch_size=prefill_batch_size,
-            decode_batch_size=decode_batch_size,
-            min_query_len=min_query_len,
-            max_query_len=max_query_len,
-            min_ctx_len=min_ctx_len,
-            max_ctx_len=max_ctx_len,
-            block_size=block_size,
-            num_heads=num_heads,
-            num_kv_heads=num_kv_heads,
-            head_size=head_size,
-            dtype=dtype,
-        )
-        output_ref = ref_context_attention(
-            query,
-            key,
-            value,
-            query_lens,
-            seq_lens,
-            head_size,
-            num_queries_per_kv,
-            return_max_reduce=False,
-        )
-        # build neuron program
-        B_P_SIZE = 128
-        assert (large_tile_size >= B_P_SIZE
-                ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
-        def pad_to_multiple(a, b):
-            return cdiv(a, b) * b
-        def pad_to_next_power_of_2(a):
-            assert a > 0
-            return 2**int(a - 1).bit_length()
-        # calculate input shapes
-        max_num_queries = pad_to_next_power_of_2(sum(query_lens))
-        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-        num_active_blocks = cdiv(context_lens, block_size).sum().item()
-        num_active_blocks = pad_to_multiple(num_active_blocks,
-                                            large_tile_size // block_size)
-        context_kv_len = num_active_blocks * block_size
-        assert (
-            context_kv_len %
-            large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
-        # pad QKV tensors
-        pad_dims = (
-            0,
-            0,
-            0,
-            0,
-            0,
-            max_num_queries - query.shape[0],
-        )
-        query = F.pad(query, pad_dims, "constant", 0)
-        k = F.pad(k_active, pad_dims, "constant", 0)
-        v = F.pad(v_active, pad_dims, "constant", 0)
-        # permute QKV tensors
-        # query: (1, n_heads, d, seq_q)
-        # key:   (1, n_kv_heads, d, seq_k)
-        # value: (1, n_kv_heads, seq_v, d)
-        query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
-        k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
-        v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
-        kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
-        # transform block table
-        active_block_table = get_active_block_tables(
-            block_table.cpu(),
-            torch.tensor(query_lens).cpu(),
-            torch.tensor(seq_lens).cpu(),
-            block_size,
-            num_active_blocks,
-        )
-        # Build attention masks
-        prior_mask, active_mask = (
-            BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-                query_lens, seq_lens, block_size=block_size))
-        prior_mask_padded = F.pad(
-            prior_mask,
-            (
-                0,
-                context_kv_len - prior_mask.shape[1],
-                0,
-                max_num_queries - prior_mask.shape[0],
-            ),
-            "constant",
-            0,
-        ).bool()
-        active_mask_padded = F.pad(
-            active_mask,
-            (
-                0,
-                max_num_queries - active_mask.shape[1],
-                0,
-                max_num_queries - active_mask.shape[0],
-            ),
-            "constant",
-            0,
-        ).bool()
-        attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
-                                 dim=1)
-        attn_mask = reorder_context_mask(attn_mask, large_tile_size,
-                                         block_size)
-        input_args = (
-            query.to(device=device),
-            k.to(device=device),
-            v.to(device=device),
-            kv_cache.to(device=device),
-            active_block_table.to(device=device),
-            attn_mask.to(device=device),
-        )
-        input_kwargs = dict(
-            n_kv_head=num_kv_heads,
-            head_size=head_size,
-            mixed_precision=mixed_precision,
-            LARGE_TILE_SZ=large_tile_size,
-        )
-        output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
-        num_actual_tokens = sum(query_lens)
-        # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
-        output_nki = output_nki.cpu().permute(0, 2, 1, 3)
-        output_nki = output_nki[0, :num_actual_tokens, :, :]
-        output_ref_padded = F.pad(
-            output_ref,
-            (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
-            "constant",
-            0,
-        )
-        output_ref = output_ref_padded.transpose(
-            0, 1)[0, :num_actual_tokens, :, :]
-        torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
--- a/tests/neuron/1_core/test_rotary_embedding.py
+++ b/tests/neuron/1_core/test_rotary_embedding.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests for miscellaneous utilities
-"""
-import pytest
-import torch
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.platforms import current_platform
-@pytest.mark.parametrize(
-    "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
-        (16, False, 32, 32, 1024, True),
-        (16, False, 32, 128, 1024, True),
-        (16, True, 32, 32, 1024, True),
-        (16, True, 32, 128, 1024, True),
-        (16, False, 32, 128, 1024, False),
-        (16, True, 32, 128, 1024, False),
-    ])
-def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
-                                  head_size, seq_len, use_key):
-    import torch_xla.core.xla_model as xm
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-    batch_size = 1
-    base = 10000
-    num_heads = 8
-    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
-                          is_neox_style, torch.float32)
-    positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device="cpu")
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=torch.float32,
-                        device="cpu")
-    key = torch.randn_like(query) if use_key else None
-    assert positions.is_cpu, \
-        "reference input tensor is expected to be CPU tensor."
-    ref_query, ref_key = rot.to(device="cpu").forward_native(
-        positions, query, key)
-    out_query, out_key = rot.to(device=device).forward_neuron(
-        positions.to(device=device), query.to(device=device),
-        key.to(device=device) if key is not None else None)
-    if use_key:
-        assert out_query.is_xla and out_key.is_xla, \
-            "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out_key.cpu(),
-                                   ref_key,
-                                   atol=1e-2,
-                                   rtol=1e-2)
-    else:
-        assert out_key is None, "expected returned key to be None"
-        assert out_query.is_xla, \
-            "output tensor is expected to be XLA tensor"
-    torch.testing.assert_close(out_query.cpu(),
-                               ref_query,
-                               atol=1e-2,
-                               rtol=1e-2)
--- a/tests/neuron/2_core/test_comm_ops.py
+++ b/tests/neuron/2_core/test_comm_ops.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
-from typing import Callable
-from unittest.mock import patch
-import pytest
-import torch
-import torch_xla.distributed.xla_multiprocessing as xmp
-from typing_extensions import ParamSpec
-from vllm.distributed.communication_op import (
-    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             init_distributed_environment)
-from vllm.utils import get_distributed_init_method, get_open_port
-_P = ParamSpec("_P")
-def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
-    """Decorator to reinitialize the Neuron Runtime before executing a test.
-    This is necessary for distributed tests which need to reallocate Neuron
-    Cores to separate subprocesses.
-    """
-    @functools.wraps(f)
-    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
-        runtime = torch.classes.neuron.Runtime()
-        runtime.initialize()
-        runtime.unsafe_close()
-        f(*args, **kwargs)
-        runtime.initialize()
-    return wrapper
-def all_gather_test_worker(index, tp_degree, distributed_init_method):
-    init_distributed_environment(tp_degree,
-                                 index,
-                                 distributed_init_method,
-                                 index,
-                                 backend="xla")
-    ensure_model_parallel_initialized(tp_degree, 1)
-    num_dimensions = 3
-    tensor_size = list(range(2, num_dimensions + 2))
-    total_size = 1
-    for s in tensor_size:
-        total_size *= s
-    all_gather_dimension = -1
-    all_tensors = [
-        torch.arange(total_size, dtype=torch.float32,
-                     device="xla").reshape(tensor_size) * (r + 1)
-        for r in range(tp_degree)
-    ]
-    expected = torch.cat(all_tensors, dim=all_gather_dimension)
-    t = all_tensors[index % tp_degree]
-    t = tensor_model_parallel_all_gather(t, all_gather_dimension)
-    torch.testing.assert_close(t, expected)
-def all_reduce_test_worker(index, tp_degree, distributed_init_method):
-    init_distributed_environment(tp_degree,
-                                 index,
-                                 distributed_init_method,
-                                 index,
-                                 backend="xla")
-    ensure_model_parallel_initialized(tp_degree, 1)
-    num_elements = 8
-    all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1)
-        for r in range(tp_degree)
-    ]
-    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    t = all_tensors[index % tp_degree]
-    t = tensor_model_parallel_all_reduce(t)
-    torch.testing.assert_close(t, expected)
-@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("test_target",
-                         [all_reduce_test_worker, all_gather_test_worker])
-@reinitialize_neuron_runtime
-def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
-                                              test_target):
-    with patch('torch_xla._XLAC._xla_runtime_is_initialized',
-               return_value=False):
-        distributed_init_method = get_distributed_init_method(
-            "127.0.0.1", get_open_port())
-        monkeypatch.setenv("VLLM_USE_V1", "1")
-        monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
-        monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
-                           ','.join(['1' for _ in range(tp_size)]))
-        xmp.spawn(test_target, args=(tp_size, distributed_init_method))
--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import json
-import os
-import shutil
-import tempfile
-import torch
-from huggingface_hub import snapshot_download
-from safetensors import safe_open
-from vllm import LLM, SamplingParams
-def patch_eagle_draft_with_lm_head(target_model_id: str,
-                                   draft_model_id: str) -> str:
-    # In NxDI, draft model checkpoint must include lm_head weights from target
-    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
-    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
-    # #eagle-checkpoint-compatibility
-    final_draft_dir = "/tmp/patched_eagle_draft"
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        target_dir = snapshot_download(repo_id=target_model_id,
-                                       local_dir=os.path.join(
-                                           tmp_dir, "target"))
-        draft_dir = snapshot_download(repo_id=draft_model_id,
-                                      local_dir=os.path.join(tmp_dir, "draft"))
-        lm_head_key = "lm_head.weight"
-        index_path = os.path.join(target_dir, "model.safetensors.index.json")
-        with open(index_path) as f:
-            index = json.load(f)
-        shard_name = index["weight_map"][lm_head_key]
-        target_safetensor_path = os.path.join(target_dir, shard_name)
-        with safe_open(target_safetensor_path, framework="pt") as f:
-            target_lm_head = f.get_tensor(lm_head_key)
-        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
-        draft_state_dict = torch.load(draft_path, map_location="cpu")
-        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
-        torch.save(draft_state_dict, draft_path)
-        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
-    return final_draft_dir
-def test_eagle():
-    patched_draft_path = patch_eagle_draft_with_lm_head(
-        target_model_id="meta-llama/Llama-2-7b-hf",
-        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
-    llm = LLM(
-        model="meta-llama/Llama-2-7b-hf",
-        speculative_config={
-            "model": patched_draft_path,
-            "num_speculative_tokens": 5,
-            "max_model_len": 128
-        },
-        max_num_seqs=1,
-        max_model_len=128,
-        tensor_parallel_size=2,
-        override_neuron_config={
-            "enable_eagle_speculation": True,
-            "enable_fused_speculation": True,
-            "fused_qkv": True
-        },
-    )
-    prompts = [
-        "The president of the United States is",
-    ]
-    outputs = llm.generate(prompts, SamplingParams(top_k=1))
-    expected_output = " the head of state and head of government of " \
-    "the United States. The president direct"
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
-        assert (expected_output == generated_text)
-    print("Neuron Eagle speculation test passed.")
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm import LLM, SamplingParams
-def test_mistral():
-    llm = LLM(model="mistralai/Mistral-7B-v0.1",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=128,
-              override_neuron_config={
-                  "sequence_parallel_enabled": False,
-                  "skip_warmup": True
-              })
-    # Send more prompts than the compiled batch size (4) and request
-    # varying generation lengths to test accuracy related to Neuron
-    # specific sequence id sorting.
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-        "What is Annapurna labs?",
-        "I believe the meaning of life is",
-        "Tell me a story about a brave knight",
-        "Hello, my name is Llama",
-    ]
-    sampling_params = [
-        SamplingParams(top_k=1, max_tokens=10),
-        SamplingParams(top_k=1, max_tokens=20),
-        SamplingParams(top_k=1, max_tokens=30),
-        SamplingParams(top_k=1, max_tokens=40),
-        SamplingParams(top_k=1, max_tokens=50),
-        SamplingParams(top_k=1, max_tokens=60)
-    ]
-    outputs = llm.generate(prompts, sampling_params)
-    expected_outputs = [
-        " the most powerful person in the world. He is",
-        " a city of many faces. It is a city of history, culture, art, "
-        "fashion, and",
-        "\n\nAnnapurna Labs is a semiconductor company that was founded "
-        "in 2013 by Amazon. The company is",
-        " to be happy.\n\nI believe that happiness is a choice.\n\nI "
-        "believe that happiness is a state of mind.\n\nI believe that "
-        "happiness is a journey.\n\nI believe",
-        " who rescued a princess from a dragon.\n\nTell me a story about"
-        " a princess who rescued herself from a dragon.\n\nTell me a "
-        "story about a princess who rescued herself from a dragon and "
-        "then rescued a knight from",
-        " and I am a 10 year old male. I am a very friendly and "
-        "affectionate boy who loves to be around people. I am a very "
-        "active boy who loves to play and run around. I am a very smart "
-        "boy who loves to learn new things. I am a very loyal boy"
-    ]
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
-        assert (expected_output == generated_text)
-    print("Neuron Mistral test passed.")
--- a/tests/neuron/2_core/test_multi_lora.py
+++ b/tests/neuron/2_core/test_multi_lora.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from huggingface_hub import snapshot_download
-from vllm import LLM, SamplingParams
-from vllm.lora.request import LoRARequest
-def test_llama_single_lora():
-    sql_lora_files = snapshot_download(
-        repo_id="yard1/llama-2-7b-sql-lora-test")
-    llm = LLM(model="meta-llama/Llama-2-7b-hf",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=512,
-              override_neuron_config={
-                  "sequence_parallel_enabled": False,
-                  "skip_warmup": True,
-                  "lora_modules": [{
-                      "name": "lora_id_1",
-                      "path": sql_lora_files
-                  }]
-              },
-              enable_lora=True,
-              max_loras=1,
-              max_lora_rank=256,
-              device="neuron")
-    """For multi-lora requests using NxDI as the backend, only the lora_name 
-    needs to be specified. The lora_id and lora_path are supplied at the LLM 
-    class/server initialization, after which the paths are handled by NxDI"""
-    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-    ]
-    outputs = llm.generate(prompts,
-                           SamplingParams(top_k=1),
-                           lora_request=[lora_req_1, lora_req_1])
-    expected_outputs = [
-        " the head of state and head of government of the United States. "
-        "The president direct",
-        " a city of contrasts. The city is home to the Eiffel Tower"
-    ]
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
-def test_llama_multiple_lora():
-    sql_lora_files = snapshot_download(
-        repo_id="yard1/llama-2-7b-sql-lora-test")
-    llm = LLM(model="meta-llama/Llama-2-7b-hf",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=512,
-              override_neuron_config={
-                  "sequence_parallel_enabled":
-                  False,
-                  "skip_warmup":
-                  True,
-                  "lora_modules": [{
-                      "name": "lora_id_1",
-                      "path": sql_lora_files
-                  }, {
-                      "name": "lora_id_2",
-                      "path": sql_lora_files
-                  }]
-              },
-              enable_lora=True,
-              max_loras=2,
-              max_lora_rank=256,
-              device="neuron")
-    """For multi-lora requests using NxDI as the backend, only the lora_name 
-    needs to be specified. The lora_id and lora_path are supplied at the LLM 
-    class/server initialization, after which the paths are handled by NxDI"""
-    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
-    lora_req_2 = LoRARequest("lora_id_2", 1, " ")
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-    ]
-    outputs = llm.generate(prompts,
-                           SamplingParams(top_k=1),
-                           lora_request=[lora_req_1, lora_req_2])
-    expected_outputs = [
-        " the head of state and head of government of the United States. "
-        "The president direct",
-        " a city of contrasts. The city is home to the Eiffel Tower"
-    ]
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-def register_prithvi_india():
-    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia"  # noqa: E501
-def register_prithvi_valencia():
+def register_prithvi():
-    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia"  # noqa: E501
+    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor"  # noqa: E501
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -8,7 +8,7 @@ import datetime
 import os
 import tempfile
 import urllib.request
-from collections.abc import AsyncGenerator, Sequence
+from collections.abc import Sequence
 from typing import Any, Optional, Union
 import albumentations
@@ -234,6 +234,8 @@ def load_image(
 class PrithviMultimodalDataProcessor(IOProcessor):
+    indices = [0, 1, 2, 3, 4, 5]
    def __init__(self, vllm_config: VllmConfig):
        super().__init__(vllm_config)
@@ -359,14 +361,6 @@ class PrithviMultimodalDataProcessor(IOProcessor):
        return prompts
-    async def pre_process_async(
-        self,
-        prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
-        **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
-        return self.pre_process(prompt, request_id, **kwargs)
    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
@@ -420,30 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor):
                                  format="tiff",
                                  data=out_data,
                                  request_id=request_id)
-    async def post_process_async(
-        self,
-        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
-        request_id: Optional[str] = None,
-        **kwargs,
-    ) -> IOProcessorOutput:
-        collected_output = [item async for i, item in model_output]
-        return self.post_process(collected_output, request_id, **kwargs)
-class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):
-    def __init__(self, vllm_config: VllmConfig):
-        super().__init__(vllm_config)
-        self.indices = [1, 2, 3, 8, 11, 12]
-class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor):
-    def __init__(self, vllm_config: VllmConfig):
-        super().__init__(vllm_config)
-        self.indices = [0, 1, 2, 3, 4, 5]
--- a/tests/plugins/prithvi_io_processor_plugin/setup.py
+++ b/tests/plugins/prithvi_io_processor_plugin/setup.py
@@ -9,8 +9,7 @@ setup(
    packages=["prithvi_io_processor"],
    entry_points={
        "vllm.io_processor_plugins": [
-            "prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india",  # noqa: E501
+            "prithvi_to_tiff = prithvi_io_processor:register_prithvi",  # noqa: E501
-            "prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia",  # noqa: E501
        ]
    },
 )
--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -7,12 +7,11 @@ import requests
 from tests.utils import RemoteOpenAIServer
 from vllm.config import VllmConfig
-from vllm.entrypoints.llm import LLM
 from vllm.entrypoints.openai.protocol import IOProcessorResponse
 from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
-MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501
@@ -23,61 +22,7 @@ def test_loading_missing_plugin():
        get_io_processor(vllm_config, "wrong_plugin")
-def test_loading_engine_with_wrong_plugin():
+@pytest.fixture(scope="function")
-    with pytest.raises(ValueError):
-        LLM(
-            model=MODEL_NAME,
-            skip_tokenizer_init=True,
-            trust_remote_code=True,
-            enforce_eager=True,
-            # Limit the maximum number of parallel requests
-            # to avoid the model going OOM in CI.
-            max_num_seqs=32,
-            io_processor_plugin="wrong_plugin",
-        )
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
-    img_prompt = dict(
-        data=image_url,
-        data_format="url",
-        image_format="tiff",
-        out_data_format="b64_json",
-    )
-    pooling_params = PoolingParams(task="encode", softmax=False)
-    with vllm_runner(
-            model_name,
-            runner="pooling",
-            skip_tokenizer_init=True,
-            trust_remote_code=True,
-            enforce_eager=True,
-            # Limit the maximum number of parallel requests
-            # to avoid the model going OOM in CI.
-            max_num_seqs=1,
-            io_processor_plugin="prithvi_to_tiff_valencia",
-    ) as llm_runner:
-        pooler_output = llm_runner.get_llm().encode(
-            img_prompt,
-            pooling_params=pooling_params,
-        )
-    output = pooler_output[0].outputs
-    # verify the output is formatted as expected for this plugin
-    assert all(
-        hasattr(output, attr)
-        for attr in ["type", "format", "data", "request_id"])
-    # We just check that the output is a valid base64 string.
-    # Raises an exception and fails the test if the string is corrupted.
-    base64.b64decode(output.data)
-@pytest.fixture(scope="module")
 def server():
    args = [
        "--runner",
@@ -90,7 +35,9 @@ def server():
        "--max-num-seqs",
        "32",
        "--io-processor-plugin",
-        "prithvi_to_tiff_valencia"
+        "prithvi_to_tiff",
+        "--model-impl",
+        "terratorch",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -113,6 +60,7 @@ async def test_prithvi_mae_plugin_online(
        },
        "priority": 0,
        "model": model_name,
+        "softmax": False
    }
    ret = requests.post(
@@ -135,3 +83,43 @@ async def test_prithvi_mae_plugin_online(
    # We just check that the output is a valid base64 string.
    # Raises an exception and fails the test if the string is corrupted.
    base64.b64decode(plugin_data["data"])
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
+    """Send one offline `encode` request through the `prithvi_to_tiff`
+    IO processor plugin and check the structure of what comes back."""
+    request = {
+        "data": image_url,
+        "data_format": "url",
+        "image_format": "tiff",
+        "out_data_format": "b64_json",
+    }
+    encode_params = PoolingParams(task="encode", softmax=False)
+    with vllm_runner(
+            model_name,
+            runner="pooling",
+            skip_tokenizer_init=True,
+            trust_remote_code=True,
+            enforce_eager=True,
+            # Limit the maximum number of parallel requests
+            # to avoid the model going OOM in CI.
+            max_num_seqs=1,
+            model_impl="terratorch",
+            io_processor_plugin="prithvi_to_tiff",
+    ) as runner:
+        llm = runner.get_llm()
+        results = llm.encode(
+            request,
+            pooling_params=encode_params,
+        )
+    plugin_output = results[0].outputs
+    # The plugin output must expose every field of its response format.
+    for field_name in ("type", "format", "data", "request_id"):
+        assert hasattr(plugin_output, field_name)
+    # Only decodability is checked here: a payload that cannot be
+    # base64-decoded raises and thereby fails the test.
+    base64.b64decode(plugin_output.data)
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -27,7 +27,7 @@ def use_v0_only(monkeypatch):
                    reason="ModelOpt FP8 is not supported on this GPU type.")
 def test_modelopt_fp8_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8 checkpoint loading and structure validation."""
-    # TODO: provide a small publically available test checkpoint
+    # TODO: provide a small publicly available test checkpoint
    model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
                  "TinyLlama-1.1B-Chat-v1.0-fp8-0710")

--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -75,5 +75,25 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
        print(output)
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly "
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now")
+def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
+    """Load the torchao AWQ int4 weight-only OPT-125m checkpoint and check
+    that greedy generation returns a non-empty result."""
+    # Reset TorchDynamo state before the runner is created.
+    torch._dynamo.reset()
+    model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
+                  "-0.14.0.dev")
+    with vllm_runner(model_name=model_name,
+                     quantization="torchao",
+                     dtype="bfloat16",
+                     pt_load_map_location="cuda:0") as llm:
+        output = llm.generate_greedy(["The capital of France is"],
+                                     max_tokens=32)
+        assert output
+        print(output)
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from vllm import SamplingParams
-from vllm.config import LoadConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader import get_model_loader
 load_format = "runai_streamer"

--- a/tests/runai_model_streamer_test/test_runai_utils.py
+++ b/tests/runai_model_streamer_test/test_runai_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import glob
+import os
+import tempfile
+import huggingface_hub.constants
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf)
+from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
+                                                 list_safetensors)
+def test_is_runai_obj_uri():
+    """`gs://` and `s3://` URIs are accepted; an `nfs://` URI is not."""
+    accepted_uris = ("gs://some-gcs-bucket/path", "s3://some-s3-bucket/path")
+    for object_store_uri in accepted_uris:
+        assert is_runai_obj_uri(object_store_uri)
+    assert not is_runai_obj_uri("nfs://some-nfs-path")
+def test_runai_list_safetensors_local():
+    """Download gpt2 into a temporary cache and check that
+    `list_safetensors` reports every `.safetensors` file found there."""
+    # HF_HUB_OFFLINE is process-wide state: switch it off only for the
+    # duration of this test and put the previous value back afterwards so
+    # the change cannot leak into tests that run later in the same process.
+    previous_offline = huggingface_hub.constants.HF_HUB_OFFLINE
+    huggingface_hub.constants.HF_HUB_OFFLINE = False
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            download_weights_from_hf(
+                "openai-community/gpt2",
+                allow_patterns=["*.safetensors", "*.json"],
+                cache_dir=tmpdir)
+            safetensors = glob.glob(f"{tmpdir}/**/*.safetensors",
+                                    recursive=True)
+            assert len(safetensors) > 0
+            # List the directory that holds the first downloaded file.
+            parentdir = os.path.dirname(safetensors[0])
+            files = list_safetensors(parentdir)
+            assert len(safetensors) == len(files)
+    finally:
+        huggingface_hub.constants.HF_HUB_OFFLINE = previous_offline
+# Allow running both tests directly as a script, without going through pytest.
+if __name__ == "__main__":
+    test_is_runai_obj_uri()
+    test_runai_list_safetensors_local()
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit(
    beam_width: int,
 ) -> None:
    # example_prompts[1]&[3]&[7] fails due to unknown reason even without
-    # concurency limit. skip them for now.
+    # concurrency limit. skip them for now.
    example_prompts = (example_prompts[:8])
    concurrency_limit = 2
    assert len(example_prompts) > concurrency_limit

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -161,11 +161,11 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref):
        model = vllm_runner(
            model_ref,
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+        pytest.fail("Expected RuntimeError for extra config keys")
    except RuntimeError:
        out, err = capfd.readouterr()
        combined_output = out + err
-        assert ("ValueError: Model loader extra config "
+        assert ("ValueError: Unexpected extra config keys for load "
-                "is not supported for load "
                "format auto") in combined_output
    finally:
        del model
@@ -181,11 +181,12 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd,
            model_ref,
            load_format="safetensors",
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+        pytest.fail("Expected RuntimeError for extra config keys")
    except RuntimeError:
        out, err = capfd.readouterr()
        combined_output = out + err
-        assert ("ValueError: Model loader extra config is not supported "
+        assert ("ValueError: Unexpected extra config keys "
                "for load format safetensors") in combined_output
    finally:
        del model

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,8 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
 import pytest
 from vllm.compilation.backends import VllmBackend
-from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
+from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
-                         get_field, update_config)
+                         update_config)
+from vllm.config.load import LoadConfig
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform

--- a/tests/tool_use/test_openai_tool_parser.py
+++ b/tests/tool_use/test_openai_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import pytest
+from openai_harmony import (Conversation, DeveloperContent,
+                            HarmonyEncodingName, Message, Role, SystemContent,
+                            load_harmony_encoding)
+from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
+from vllm.transformers_utils.tokenizer import get_tokenizer
+MODEL = "gpt2"
+@pytest.fixture(scope="module")
+def openai_tokenizer():
+    """Build the tokenizer for `MODEL` once per test module."""
+    # The parser does not use the tokenizer, but the constructor requires it.
+    tokenizer = get_tokenizer(MODEL)
+    return tokenizer
+@pytest.fixture
+def openai_tool_parser(openai_tokenizer):
+    """Provide a fresh `OpenAIToolParser` for each test."""
+    parser = OpenAIToolParser(openai_tokenizer)
+    return parser
+@pytest.fixture(scope="module")
+def harmony_encoding():
+    """Load the `HARMONY_GPT_OSS` encoding once per test module."""
+    encoding_name = HarmonyEncodingName.HARMONY_GPT_OSS
+    return load_harmony_encoding(encoding_name)
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall],
+    expected_tool_calls: list[ToolCall],
+):
+    """Assert that the parsed tool calls match the expected ones pairwise.
+
+    Ids are only checked for type and length; `function` payloads must be
+    equal and every call must have type ``"function"``.
+    """
+    assert len(actual_tool_calls) == len(expected_tool_calls)
+    for position, produced in enumerate(actual_tool_calls):
+        wanted = expected_tool_calls[position]
+        call_id = produced.id
+        assert isinstance(call_id, str)
+        assert len(call_id) > 16  # Default from protocol.py
+        assert produced.type == "function"
+        assert produced.function == wanted.function
+def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
+    """A plain `final`-channel reply must produce no tool calls."""
+    system_msg = Message.from_role_and_content(
+        Role.SYSTEM,
+        SystemContent.new(),
+    )
+    developer_msg = Message.from_role_and_content(
+        Role.DEVELOPER,
+        DeveloperContent.new().with_instructions("Talk like a pirate!"))
+    user_msg = Message.from_role_and_content(Role.USER, "Arrr, how be you?")
+    final_msg = Message.from_role_and_content(
+        Role.ASSISTANT, "This is a test").with_channel("final")
+    conversation = Conversation.from_messages(
+        [system_msg, developer_msg, user_msg, final_msg])
+    rendered_ids = harmony_encoding.render_conversation_for_completion(
+        conversation, Role.ASSISTANT)
+    # The parser is driven by token ids only; the text argument stays empty.
+    result = openai_tool_parser.extract_tool_calls(
+        "",
+        request=None,
+        token_ids=rendered_ids,
+    )
+    assert not result.tools_called
+    assert result.tool_calls == []
+    assert result.content == "This is a test"
+def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding):
+    """One commentary message addressed to a function yields one tool call
+    and no text content."""
+    user_msg = Message.from_role_and_content(Role.USER,
+                                             "What is the weather in Tokyo?")
+    analysis_msg = Message.from_role_and_content(
+        Role.ASSISTANT,
+        'User asks: "What is the weather in Tokyo?" We need to use get_current_weather tool.',  #  noqa: E501
+    ).with_channel("analysis")
+    weather_call_msg = Message.from_role_and_content(
+        Role.ASSISTANT, '{"location": "Tokyo"}')
+    weather_call_msg = weather_call_msg.with_channel("commentary")
+    weather_call_msg = weather_call_msg.with_recipient(
+        "functions.get_current_weather").with_content_type("json")
+    conversation = Conversation.from_messages(
+        [user_msg, analysis_msg, weather_call_msg])
+    rendered_ids = harmony_encoding.render_conversation_for_completion(
+        conversation, Role.ASSISTANT)
+    result = openai_tool_parser.extract_tool_calls(
+        "",
+        request=None,
+        token_ids=rendered_ids,
+    )
+    assert result.tools_called
+    weather_call = ToolCall(function=FunctionCall(
+        name="get_current_weather",
+        arguments=json.dumps({"location": "Tokyo"}),
+    ))
+    assert_tool_calls(result.tool_calls, [weather_call])
+    assert result.content is None
+def test_extract_tool_calls_multiple_tools(
+    openai_tool_parser,
+    harmony_encoding,
+):
+    """Two commentary messages addressed to different functions yield two
+    tool calls, compared in order, and no text content."""
+    user_msg = Message.from_role_and_content(
+        Role.USER, "What is the weather in Tokyo based on where I'm at?")
+    analysis_msg = Message.from_role_and_content(
+        Role.ASSISTANT,
+        'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  #  noqa: E501
+    ).with_channel("analysis")
+    weather_call_msg = Message.from_role_and_content(
+        Role.ASSISTANT, '{"location": "Tokyo"}')
+    weather_call_msg = weather_call_msg.with_channel("commentary")
+    weather_call_msg = weather_call_msg.with_recipient(
+        "functions.get_current_weather").with_content_type("json")
+    location_call_msg = Message.from_role_and_content(
+        Role.ASSISTANT, '{"location": "Tokyo"}')
+    location_call_msg = location_call_msg.with_channel("commentary")
+    location_call_msg = location_call_msg.with_recipient(
+        "functions.get_user_location").with_content_type("json")
+    conversation = Conversation.from_messages(
+        [user_msg, analysis_msg, weather_call_msg, location_call_msg])
+    rendered_ids = harmony_encoding.render_conversation_for_completion(
+        conversation,
+        Role.ASSISTANT,
+    )
+    result = openai_tool_parser.extract_tool_calls(
+        "",
+        request=None,
+        token_ids=rendered_ids,
+    )
+    assert result.tools_called
+    # Both calls carry the same JSON-encoded argument payload.
+    tokyo_arguments = json.dumps({"location": "Tokyo"})
+    wanted_calls = [
+        ToolCall(function=FunctionCall(name=function_name,
+                                       arguments=tokyo_arguments))
+        for function_name in ("get_current_weather", "get_user_location")
+    ]
+    assert_tool_calls(result.tool_calls, wanted_calls)
+    assert result.content is None
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
        expected_value=0.76),  # no bias
    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As
-    # a follow up, move this into the LM-EVAL section of the CI.
+    # a follow-up, move this into the LM-EVAL section of the CI.
    # GSM8KAccuracyTestConfig(
    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
    #     expected_value=0.66),  # bias in QKV layers