[Chore]:Extract math and argparse utilities to separate modules (#27188)

Signed-off-by: Yeshwanth Surya <yeshsurya@gmail.com> Signed-off-by: Yeshwanth N <yeshsurya@gmail.com> Signed-off-by: yeshsurya <yeshsurya@gmail.com>

[Chore]:Extract math and argparse utilities to separate modules (#27188)
Signed-off-by: Yeshwanth Surya <yeshsurya@gmail.com> Signed-off-by: Yeshwanth N <yeshsurya@gmail.com> Signed-off-by: yeshsurya <yeshsurya@gmail.com>
71b1c8b6 · Yeshwanth N · GitHub · 8fb7b2fa · 71b1c8b6 · 71b1c8b6
Unverified Commit 71b1c8b6 authored Oct 26, 2025 by Yeshwanth N Committed by GitHub Oct 26, 2025
20 changed files
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -5,7 +5,7 @@ import pytest
 import torch
 from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 @pytest.mark.parametrize("B", [3, 5])

--- a/tests/kernels/moe/test_cutlass_grouped_gemm.py
+++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py
@@ -13,8 +13,8 @@ from tests.kernels.moe.utils import per_token_cast_to_fp8
 from tests.kernels.utils import baseline_scaled_mm
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
 from vllm.utils.deep_gemm import per_block_cast_to_fp8
+from vllm.utils.math_utils import cdiv
 @pytest.mark.parametrize(

--- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py
+++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
@@ -27,7 +27,7 @@ from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
    triton_kernel_moe_forward,
 )
 from vllm.model_executor.layers.utils import shuffle_weight
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 def deshuffle(w: torch.Tensor):

--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
    moe_align_block_size,
 )
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 NUM_TOKENS = [1, 3, 256, 2256, 4096]
 NUM_EXPERTS = [32, 160, 256, 257]

--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExper
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch

--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -45,7 +45,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate,
 )
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch

--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -8,7 +8,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    persistent_masked_m_silu_mul_quant,
 )
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 fp8_dtype = torch.float8_e4m3fn

--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
 )
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
-from vllm.utils import round_up
 from vllm.utils.deep_gemm import per_block_cast_to_fp8
+from vllm.utils.math_utils import round_up
 def triton_moe(

--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -6,7 +6,7 @@ import torch
 from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.

--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -13,7 +13,7 @@ import torch
 from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 MNK_FACTORS = [
    (1, 256, 128),

--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -18,7 +18,7 @@ from tests.v1.attention.utils import (
 from vllm.attention.backends.registry import _Backend
 from vllm.config import ModelConfig
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer
 from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,

--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -22,7 +22,7 @@ from vllm import _custom_ops as ops
 from vllm.attention.backends.registry import _Backend
 from vllm.attention.ops.flashmla import is_flashmla_dense_supported
 from vllm.config.vllm import set_current_vllm_config
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec

--- a/tests/v1/attention/test_sparse_mla_backends.py
+++ b/tests/v1/attention/test_sparse_mla_backends.py
@@ -23,7 +23,7 @@ from tests.v1.attention.utils import (
 from vllm import _custom_ops as ops
 from vllm.attention.ops import flashmla
 from vllm.model_executor.layers.linear import ColumnParallelLinear
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend
 from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks

--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -8,7 +8,7 @@ import pytest
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 def test_prefix_caching_from_cli():

--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -16,7 +16,7 @@ from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    subclass_attention_backend,

--- a/vllm/attention/ops/pallas_kv_cache_update.py
+++ b/vllm/attention/ops/pallas_kv_cache_update.py
@@ -7,7 +7,7 @@ import jax
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 def _kv_cache_update_kernel(

--- a/vllm/attention/ops/rocm_aiter_paged_attn.py
+++ b/vllm/attention/ops/rocm_aiter_paged_attn.py
@@ -6,7 +6,7 @@ import torch
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 FP8_DTYPE = current_platform.fp8_dtype()

--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -58,7 +58,7 @@ except ImportError:
    librosa = PlaceholderModule("librosa")
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -19,7 +19,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    kStaticTensorScale,
 )
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32
 from .fx_utils import is_func

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -82,7 +82,8 @@ from vllm.transformers_utils.config import (
    maybe_override_with_speculators,
 )
 from vllm.transformers_utils.utils import check_gguf_file
-from vllm.utils import FlexibleArgumentParser, is_in_ray_actor
+from vllm.utils import is_in_ray_actor
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
 from vllm.v1.sample.logits_processor import LogitsProcessor