sync v0.15.1

c721b814 · zhuwenwen · d53fe7e5 · c721b814 · c721b814 · d53fe7e5
Commit c721b814 authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,7 +87,6 @@ if TYPE_CHECKING:
    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
    VLLM_PLUGINS: list[str] | None = None
    VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
-    VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
    # Deprecated env variables for profiling, kept for backward compatibility
    # See also vllm/config/profiler.py and `--profiler-config` argument
    VLLM_TORCH_CUDA_PROFILE: str | None = None
@@ -289,11 +288,16 @@ def use_aot_compile() -> bool:
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
+    from vllm.platforms import current_platform
    from vllm.utils.torch_utils import is_torch_equal_or_newer

    default_value = (
        "1"
-        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        if is_torch_equal_or_newer("2.10.0.dev")
+        and not disable_compile_cache()
+        # Disabling AOT_COMPILE for CPU
+        # See: https://github.com/vllm-project/vllm/issues/32033
+        and not current_platform.is_cpu()
        else "0"
    )

@@ -870,13 +874,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
-    # A remote HF repo(s) containing one or more LoRA adapters, which
-    # may be downloaded and leveraged as needed. Only works if plugins
-    # are enabled and VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
-    # Values should be comma separated.
-    "VLLM_LORA_RESOLVER_HF_REPO_LIST": lambda: os.getenv(
-        "VLLM_LORA_RESOLVER_HF_REPO_LIST", None
-    ),
    # Enables torch CUDA profiling if set to 1.
    # Deprecated, see profiler_config.
    "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
@@ -884,7 +881,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Deprecated, see profiler_config.
    "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"),
    # Enable torch profiler to record shapes if set to 1.
-    # Deprecated, see profiler_config.
+    # Deprecated, see profiler_config.
    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: (
        os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES")
    ),

--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from vllm.logging_utils.access_log_filter import (
-    UvicornAccessLogFilter,
-    create_uvicorn_log_config,
-)
 from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter
 from vllm.logging_utils.lazy import lazy
 from vllm.logging_utils.log_time import logtime
@@ -12,8 +8,6 @@ from vllm.logging_utils.log_time import logtime
 __all__ = [
    "NewLineFormatter",
    "ColoredFormatter",
-    "UvicornAccessLogFilter",
-    "create_uvicorn_log_config",
    "lazy",
    "logtime",
 ]
--- a/vllm/logging_utils/access_log_filter.py
+++ b/vllm/logging_utils/access_log_filter.py
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -62,7 +62,6 @@ def _fused_moe_lora_kernel(
    num_experts,
    lora_ids,
    adapter_enabled,
-    max_loras,  # <<< PR2: rename, used for masks when grid axis-2 != max_loras
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
@@ -84,7 +83,6 @@ def _fused_moe_lora_kernel(
    num_slice_c: tl.constexpr,
    top_k: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
-    USE_B_L2_CACHE: tl.constexpr,  # new, enable .ca load for B
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
@@ -106,13 +104,10 @@ def _fused_moe_lora_kernel(
    if moe_enabled == 0:
        # Early exit for the no moe lora case.
        return
-    # The grid's axis-2 dimension is max_loras + 1 to accommodate the -1 sentinel.
-    # This guard ensures we don't access sorted_token_ids / expert_ids /
-    # num_tokens_post_padded beyond their allocated bounds if an invalid
-    # lora_id somehow appears. Although the caller should pass correct
-    # max_loras, defensive programming prevents accidental out-of-bounds.
-    if lora_id >= max_loras:
-        return
+    # The grid size on axis 2 is (max_loras + 1) to handle the no-lora case
+    # (lora_id == -1), but sorted_token_ids and expert_ids are allocated with
+    # shape (max_loras, ...). Use (num_programs - 1) for correct bounds checking.
+    max_loras = tl.num_programs(axis=2) - 1
    grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)

    # calculate pid_m,pid_n
@@ -141,11 +136,10 @@ def _fused_moe_lora_kernel(
    cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
    cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size

-    # remove modulo wrap-around
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
    offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)

-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int32)
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    token_ind = stride_tl * lora_id + offs_token_id
    offs_token = tl.load(
        sorted_token_ids_ptr + token_ind,
@@ -183,11 +177,7 @@ def _fused_moe_lora_kernel(
        # before continuing.
        # pre-fetch lora weight
        # only the K tail is masked here; offs_bn is already wrapped modulo N
-        b_mask = (offs_k[:, None] < k_remaining) & (offs_bn[None, :] < N)
-        if USE_B_L2_CACHE:
-            b = tl.load(b_ptrs, mask=b_mask, other=0.0, cache_modifier=".ca")
-        else:
-            b = tl.load(b_ptrs, mask=b_mask, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)

        if USE_GDC and not IS_PRIMARY:
            tl.extra.cuda.gdc_wait()
@@ -288,7 +278,6 @@ def _fused_moe_lora_shrink(
        num_experts,
        lora_ids,
        adapter_enabled,
-        lora_a_stacked[0].shape[0],
        qcurr_hidden_states.stride(0),
        qcurr_hidden_states.stride(1),
        w1_lora_a_stacked.stride(0),
@@ -305,7 +294,6 @@ def _fused_moe_lora_shrink(
        num_slice_c=num_slices,
        top_k=1 if mul_routed_weight else top_k_num,
        MUL_ROUTED_WEIGHT=False,
-        USE_B_L2_CACHE=True,  # new
        IS_PRIMARY=True,
        **shrink_config,
    )
@@ -391,7 +379,6 @@ def _fused_moe_lora_expand(
        num_experts,
        lora_ids,
        adapter_enabled,
-        lora_b_stacked[0].shape[0],
        a_intermediate_cache1.stride(0),
        a_intermediate_cache1.stride(1),
        w1_lora_b_stacked.stride(0),
@@ -408,7 +395,6 @@ def _fused_moe_lora_expand(
        num_slice_c=num_slices,
        top_k=1,
        MUL_ROUTED_WEIGHT=mul_routed_weight,
-        USE_B_L2_CACHE=True,  # new
        IS_PRIMARY=False,
        **expand_config,
    )

--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -1554,7 +1554,6 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
    ):
        assert isinstance(prefill, FlashInferPrefillMetadata)
-
        attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
            q=q,
            k=k,

--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=BW200.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=K100_AI_nn.json