[Misc] Clean up utils (#27552)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Misc] Clean up utils (#27552)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
7c2bdb83 · Cyrus Leung · GitHub · 9932ed6a · 7c2bdb83 · 7c2bdb83
Unverified Commit 7c2bdb83 authored Oct 27, 2025 by Cyrus Leung Committed by GitHub Oct 27, 2025
12 changed files
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -65,7 +65,9 @@ ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
 CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
 cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
 run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
-FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
+FlexibleArgumentParser = auto_mock(
+    "vllm.utils.argparse_utils", "FlexibleArgumentParser"
+)


 class MarkdownFormatter(HelpFormatter):

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -45,9 +45,7 @@ from vllm.entrypoints.cli.serve import ServeSubcommand
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import (
-    FlexibleArgumentParser,
-)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GB_bytes
 from vllm.utils.network_utils import get_open_port
 from vllm.utils.torch_utils import cuda_device_count_stateless

--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -4,23 +4,15 @@

 import json
 import os
-import tempfile
-from pathlib import Path
-from unittest.mock import patch

 import pytest
-import torch
 import yaml
 from transformers import AutoTokenizer

-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens

-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product


 # Tests for FlexibleArgumentParser
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text


-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-    with set_current_vllm_config(cfg):
-        from vllm.attention import Attention
-
-        ctx = {
-            "layers.0.self_attn": Attention(32, 128, 0.1),
-        }
-        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-        bind_kv_cache(ctx, kv_cache)
-        assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
-
-
 def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
 ):

--- a/tests/utils_/test_serial_utils.py
+++ b/tests/utils_/test_serial_utils.py
@@ -14,7 +14,7 @@ from vllm.utils.serial_utils import (

 @pytest.mark.parametrize("endianness", ENDIANNESS)
 @pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
-@torch.inference_mode
+@torch.inference_mode()
 def test_encode_and_decode(embed_dtype: str, endianness: str):
    for i in range(10):
        tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)

--- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -42,7 +42,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv

 if TYPE_CHECKING:
    from vllm.attention.backends.abstract import AttentionMetadata

--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -44,7 +44,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils impo
 )
 from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
 from vllm.sampling_params import SamplingParams
-from vllm.utils import cdiv, get_kv_cache_torch_dtype
+from vllm.utils import get_kv_cache_torch_dtype
+from vllm.utils.math_utils import cdiv
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.version import __version__ as VLLM_VERSION


--- a/vllm/entrypoints/anthropic/api_server.py
+++ b/vllm/entrypoints/anthropic/api_server.py
@@ -51,7 +51,8 @@ from vllm.entrypoints.utils import (
    with_cancellation,
 )
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser, set_ulimit
+from vllm.utils import set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.version import __version__ as VLLM_VERSION


--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -18,7 +18,7 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_se
 from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import get_tcp_uri
 from vllm.utils.system_utils import decorate_logs, set_process_title
 from vllm.v1.engine.core import EngineCoreProc

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -108,7 +108,8 @@ from vllm.entrypoints.utils import (
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, FlexibleArgumentParser, set_ulimit
+from vllm.utils import Device, set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.utils.system_utils import decorate_logs
 from vllm.v1.engine.exceptions import EngineDeadError

--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -13,7 +13,7 @@ import torch

 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON, triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up

 if HAS_TRITON:
    from vllm.lora.ops.triton_ops import (

--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -48,9 +48,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_s
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
-from vllm.utils import round_up
 from vllm.utils.flashinfer import has_flashinfer
 from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import round_up
 from vllm.utils.torch_utils import is_torch_equal_or_newer

 logger = init_logger(__name__)

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -12,12 +12,10 @@ import signal
 import sys
 import tempfile
 import threading
-import traceback
 import uuid
 import warnings
-import weakref
 from collections.abc import Callable
-from functools import cache, partial, wraps
+from functools import partial, wraps
 from typing import TYPE_CHECKING, Any, TypeVar

 import cloudpickle
@@ -28,34 +26,6 @@ import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.ray.lazy_utils import is_in_ray_actor

-# Import utilities from specialized modules for backward compatibility
-from vllm.utils.argparse_utils import (
-    FlexibleArgumentParser,
-    SortedHelpFormatter,
-    StoreBoolean,
-)
-from vllm.utils.math_utils import (
-    cdiv,
-    next_power_of_2,
-    prev_power_of_2,
-    round_down,
-    round_up,
-)
-from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized
-
-__all__ = [
-    # Argparse utilities
-    "FlexibleArgumentParser",
-    "SortedHelpFormatter",
-    "StoreBoolean",
-    # Math utilities
-    "cdiv",
-    "next_power_of_2",
-    "prev_power_of_2",
-    "round_down",
-    "round_up",
-]
-
 _DEPRECATED_MAPPINGS = {
    "cprofile": "profiling",
    "cprofile_context": "profiling",
@@ -84,12 +54,8 @@ def __dir__() -> list[str]:


 if TYPE_CHECKING:
-    from argparse import Namespace
-
    from vllm.config import ModelConfig, VllmConfig
 else:
-    Namespace = object
-
    ModelConfig = object
    VllmConfig = object

@@ -149,35 +115,33 @@ class Counter:
        self.counter = 0


-def random_uuid() -> str:
-    return str(uuid.uuid4().hex)
-
+class AtomicCounter:
+    """An atomic, thread-safe counter"""

-def update_environment_variables(envs: dict[str, str]):
-    for k, v in envs.items():
-        if k in os.environ and os.environ[k] != v:
-            logger.warning(
-                "Overwriting environment variable %s from '%s' to '%s'",
-                k,
-                os.environ[k],
-                v,
-            )
-        os.environ[k] = v
+    def __init__(self, initial=0):
+        """Initialize a new atomic counter to given initial value"""
+        self._value = initial
+        self._lock = threading.Lock()

+    def inc(self, num=1):
+        """Atomically increment the counter by num and return the new value"""
+        with self._lock:
+            self._value += num
+            return self._value

-@cache
-def is_pin_memory_available() -> bool:
-    from vllm.platforms import current_platform
+    def dec(self, num=1):
+        """Atomically decrement the counter by num and return the new value"""
+        with self._lock:
+            self._value -= num
+            return self._value

-    return current_platform.is_pin_memory_available()
+    @property
+    def value(self):
+        return self._value


-@cache
-def is_uva_available() -> bool:
-    """Check if Unified Virtual Addressing (UVA) is available."""
-    # UVA requires pinned memory.
-    # TODO: Add more requirements for UVA if needed.
-    return is_pin_memory_available()
+def random_uuid() -> str:
+    return str(uuid.uuid4().hex)


 # TODO: This function can be removed if transformer_modules classes are
@@ -212,47 +176,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
        enable_trace_function_call(log_path)


-def weak_bind(
-    bound_method: Callable[..., Any],
-) -> Callable[..., None]:
-    """Make an instance method that weakly references
-    its associated instance and no-ops once that
-    instance is collected."""
-    ref = weakref.ref(bound_method.__self__)  # type: ignore[attr-defined]
-    unbound = bound_method.__func__  # type: ignore[attr-defined]
-
-    def weak_bound(*args, **kwargs) -> None:
-        if inst := ref():
-            unbound(inst, *args, **kwargs)
-
-    return weak_bound
-
-
-class AtomicCounter:
-    """An atomic, thread-safe counter"""
-
-    def __init__(self, initial=0):
-        """Initialize a new atomic counter to given initial value"""
-        self._value = initial
-        self._lock = threading.Lock()
-
-    def inc(self, num=1):
-        """Atomically increment the counter by num and return the new value"""
-        with self._lock:
-            self._value += num
-            return self._value
-
-    def dec(self, num=1):
-        """Atomically decrement the counter by num and return the new value"""
-        with self._lock:
-            self._value -= num
-            return self._value
-
-    @property
-    def value(self):
-        return self._value
-
-
 def kill_process_tree(pid: int):
    """
    Kills all descendant processes of the given pid by sending SIGKILL.
@@ -303,13 +226,6 @@ def set_ulimit(target_soft_limit=65535):
            )


-# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501
-def get_exception_traceback():
-    etype, value, tb = sys.exc_info()
-    err_str = "".join(traceback.format_exception(etype, value, tb))
-    return err_str
-
-
 def _maybe_force_spawn():
    """Check if we need to force the use of the `spawn` multiprocessing start
    method.
@@ -327,6 +243,8 @@ def _maybe_force_spawn():
        os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
        reasons.append("In a Ray actor and can only be spawned")

+    from .platform_utils import cuda_is_initialized, xpu_is_initialized
+
    if cuda_is_initialized():
        reasons.append("CUDA is initialized")
    elif xpu_is_initialized():
@@ -356,55 +274,6 @@ def get_mp_context():
    return multiprocessing.get_context(mp_method)


-def bind_kv_cache(
-    ctx: dict[str, Any],
-    kv_cache: list[list[torch.Tensor]],  # [virtual_engine][layer_index]
-    shared_kv_cache_layers: dict[str, str] | None = None,
-) -> None:
-    # Bind the kv_cache tensor to Attention modules, similar to
-    # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)]
-    # Special things handled here:
-    # 1. Some models have non-attention layers, e.g., Jamba
-    # 2. Pipeline parallelism, each rank only has a subset of layers
-    # 3. Encoder attention has no kv cache
-    # 4. Encoder-decoder models, encoder-decoder attention and decoder-only
-    #    attention of the same layer (e.g., bart's decoder.layers.1.self_attn
-    #    and decoder.layers.1.encoder_attn) is mapped to the same kv cache
-    #    tensor
-    # 5. Some models have attention layers that share kv cache with previous
-    #    layers, this is specified through shared_kv_cache_layers
-    if shared_kv_cache_layers is None:
-        shared_kv_cache_layers = {}
-    from vllm.attention import AttentionType
-    from vllm.model_executor.models.utils import extract_layer_index
-
-    layer_need_kv_cache = [
-        layer_name
-        for layer_name in ctx
-        if (
-            hasattr(ctx[layer_name], "attn_type")
-            and ctx[layer_name].attn_type
-            in (AttentionType.DECODER, AttentionType.ENCODER_DECODER)
-        )
-        and ctx[layer_name].kv_sharing_target_layer_name is None
-    ]
-    layer_index_sorted = sorted(
-        set(extract_layer_index(layer_name) for layer_name in layer_need_kv_cache)
-    )
-    for layer_name in layer_need_kv_cache:
-        kv_cache_idx = layer_index_sorted.index(extract_layer_index(layer_name))
-        forward_ctx = ctx[layer_name]
-        assert len(forward_ctx.kv_cache) == len(kv_cache)
-        for ve, ve_kv_cache in enumerate(kv_cache):
-            forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
-    if shared_kv_cache_layers is not None:
-        for layer_name, target_layer_name in shared_kv_cache_layers.items():
-            assert extract_layer_index(target_layer_name) < extract_layer_index(
-                layer_name
-            ), "v0 doesn't support interleaving kv sharing"
-            ctx[layer_name].kv_cache = ctx[target_layer_name].kv_cache
-
-
 def run_method(
    obj: Any,
    method: str | bytes | Callable,