Unverified Commit 71b1c8b6 authored by Yeshwanth N's avatar Yeshwanth N Committed by GitHub
Browse files

[Chore]:Extract math and argparse utilities to separate modules (#27188)


Signed-off-by: default avatarYeshwanth Surya <yeshsurya@gmail.com>
Signed-off-by: default avatarYeshwanth N <yeshsurya@gmail.com>
Signed-off-by: default avataryeshsurya <yeshsurya@gmail.com>
parent 8fb7b2fa
......@@ -5,7 +5,7 @@ import pytest
import torch
from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
@pytest.mark.parametrize("B", [3, 5])
......
......@@ -13,8 +13,8 @@ from tests.kernels.moe.utils import per_token_cast_to_fp8
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.deep_gemm import per_block_cast_to_fp8
from vllm.utils.math_utils import cdiv
@pytest.mark.parametrize(
......
......@@ -27,7 +27,7 @@ from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
triton_kernel_moe_forward,
)
from vllm.model_executor.layers.utils import shuffle_weight
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
def deshuffle(w: torch.Tensor):
......
......@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size,
)
from vllm.platforms import current_platform
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
NUM_TOKENS = [1, 3, 256, 2256, 4096]
NUM_EXPERTS = [32, 160, 256, 257]
......
......@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExper
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
......
......@@ -45,7 +45,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate,
)
from vllm.platforms import current_platform
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
......
......@@ -8,7 +8,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
fp8_dtype = torch.float8_e4m3fn
......
......@@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
)
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.utils import round_up
from vllm.utils.deep_gemm import per_block_cast_to_fp8
from vllm.utils.math_utils import round_up
def triton_moe(
......
......@@ -6,7 +6,7 @@ import torch
from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
from vllm.platforms import current_platform
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
......
......@@ -13,7 +13,7 @@ import torch
from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
MNK_FACTORS = [
(1, 256, 128),
......
......@@ -18,7 +18,7 @@ from tests.v1.attention.utils import (
from vllm.attention.backends.registry import _Backend
from vllm.config import ModelConfig
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
......
......@@ -22,7 +22,7 @@ from vllm import _custom_ops as ops
from vllm.attention.backends.registry import _Backend
from vllm.attention.ops.flashmla import is_flashmla_dense_supported
from vllm.config.vllm import set_current_vllm_config
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
......
......@@ -23,7 +23,7 @@ from tests.v1.attention.utils import (
from vllm import _custom_ops as ops
from vllm.attention.ops import flashmla
from vllm.model_executor.layers.linear import ColumnParallelLinear
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend
from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks
......
......@@ -8,7 +8,7 @@ import pytest
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
def test_prefix_caching_from_cli():
......
......@@ -16,7 +16,7 @@ from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend
from vllm.config import CacheConfig, VllmConfig
from vllm.logger import init_logger
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
subclass_attention_backend,
......
......@@ -7,7 +7,7 @@ import jax
from jax.experimental import pallas as pl
from jax.experimental.pallas import tpu as pltpu
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
def _kv_cache_update_kernel(
......
......@@ -6,7 +6,7 @@ import torch
from vllm.attention.ops.paged_attn import PagedAttention
from vllm.platforms import current_platform
from vllm.utils import cdiv
from vllm.utils.math_utils import cdiv
FP8_DTYPE = current_platform.fp8_dtype()
......
......@@ -58,7 +58,7 @@ except ImportError:
librosa = PlaceholderModule("librosa")
try:
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
......
......@@ -19,7 +19,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
kStaticTensorScale,
)
from vllm.platforms import current_platform
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32
from .fx_utils import is_func
......
......@@ -82,7 +82,8 @@ from vllm.transformers_utils.config import (
maybe_override_with_speculators,
)
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import FlexibleArgumentParser, is_in_ray_actor
from vllm.utils import is_in_ray_actor
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_ip
from vllm.v1.sample.logits_processor import LogitsProcessor
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment