Merge tag 'v0.15.0rc1' into v0.15.0rc1-dev

d76fc11e · zhuwenwen · 38166ec4 · 58996f35 · d76fc11e · d76fc11e
Commit d76fc11e authored Jan 28, 2026 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -252,7 +252,7 @@ class Starcoder2Model(nn.Module):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None,
        inputs_embeds: torch.Tensor | None = None,
@@ -336,7 +336,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/step3_text.py
+++ b/vllm/model_executor/models/step3_text.py
@@ -354,7 +354,7 @@ class Step3TextModel(nn.Module):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
@@ -419,7 +419,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -350,7 +350,7 @@ class Base(
        # vLLM does not support encoder-decoder models, so if any encoder layer is
        # found in a text only model, we assume the whole model is an encoder model
        if has_encoder(self.model) and not is_multimodal(self.config):
-            self.check_version("5.0.0.dev0", "encoder models support")
+            self.check_version("5.0.0", "encoder models support")
            attn_type = AttentionType.ENCODER_ONLY
        else:
            attn_type = AttentionType.DECODER
@@ -502,7 +502,7 @@ class Base(
            )

    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
-        self.check_version("5.0.0.dev0", "Eagle3 support")
+        self.check_version("5.0.0", "Eagle3 support")
        from transformers.utils.generic import OutputRecorder

        # The default value in PreTrainedModel is None

--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -118,7 +118,7 @@ direct_register_custom_op(

 class MoEMixin(MixtureOfExperts):
    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
-        self.check_version("5.0.0.dev0", "MoE models support")
+        self.check_version("5.0.0", "MoE models support")
        # Skip MixtureOfExperts.__init__ and call the next class in MRO
        super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)


--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration(

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/voxtral_streaming.py
+++ b/vllm/model_executor/models/voxtral_streaming.py
@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -105,6 +105,7 @@ def create_whisper_attention_backend_with_block_pooling(
 ) -> type[AttentionBackend]:
    prefix = "WhisperCausalAttentionWithBlockPooling_"
    underlying_builder = underlying_attn_backend.get_builder_cls()
+    underlying_impl = underlying_attn_backend.get_impl_cls()

    class WhisperCausalAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
        def __init__(
@@ -151,6 +152,43 @@ def create_whisper_attention_backend_with_block_pooling(
                common_prefix_len, new_common_attn_metadata, fast_build
            )

+    # NOTE: We need a custom impl so we can use the transformed slot_mapping
+    # computed by `WhisperCausalAttentionWithBlockPoolingBuilder` instead of
+    # the one from `forward_context.slot_mapping` (gpu_model_runner).
+    # This follows the same pattern as CrossAttentionImpl.
+    class WhisperCausalAttentionWithBlockPoolingImpl(underlying_impl):  # type: ignore[valid-type,misc]
+        """Attention impl that writes the KV cache using the metadata's
+        ``slot_mapping`` before delegating to the underlying impl.
+        """
+
+        def forward(
+            self,
+            layer: torch.nn.Module,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            kv_cache: torch.Tensor,
+            attn_metadata: AttentionMetadata,
+            output: torch.Tensor | None = None,
+            output_scale: torch.Tensor | None = None,
+            output_block_scale: torch.Tensor | None = None,
+        ) -> torch.Tensor:
+            """Optionally update the KV cache, then run the parent forward.
+
+            All arguments are passed through unchanged to
+            ``underlying_impl.forward``; its result is returned as-is.
+            """
+            # When the underlying backend's forward does not write the KV
+            # cache itself, do it here with `attn_metadata.slot_mapping`.
+            # Skipped when no attention metadata is provided.
+            if (
+                not underlying_attn_backend.forward_includes_kv_cache_update
+                and attn_metadata is not None
+            ):
+                self.do_kv_cache_update(
+                    layer, key, value, kv_cache, attn_metadata.slot_mapping
+                )
+
+            return super().forward(
+                layer,
+                query,
+                key,
+                value,
+                kv_cache,
+                attn_metadata,
+                output,
+                output_scale,
+                output_block_scale,
+            )
+
    if not issubclass(underlying_attn_backend, FlashAttentionBackend):
        raise NotImplementedError(
            f"{underlying_attn_backend} is not yet supported."
@@ -163,6 +201,7 @@ def create_whisper_attention_backend_with_block_pooling(
        attention_backend_cls=underlying_attn_backend,
        overrides={
            "get_builder_cls": lambda: WhisperCausalAttentionWithBlockPoolingBuilder,
+            "get_impl_cls": lambda: WhisperCausalAttentionWithBlockPoolingImpl,
            "get_kv_cache_shape": lambda num_blocks,
            block_size,
            num_kv_heads,
@@ -175,6 +214,7 @@ def create_whisper_attention_backend_with_block_pooling(
                num_kv_heads // block_pool_size,
                head_size,
            ),  # TODO: generalize to other backends
+            "forward_includes_kv_cache_update": True,
        },
    )


--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: Any,

--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod
-from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
 )
@@ -169,9 +168,10 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
        # modular kernels could invoke deep_gemm_moe_fp8
        return True

-    mk: FusedMoEModularKernel = module.quant_method.fused_experts
    # Further check if the ModularKernel implementation uses the DeepGemmExperts
-    return isinstance(mk.fused_experts, (DeepGemmExperts, TritonOrDeepGemmExperts))
+    return isinstance(
+        module.quant_method.moe_mk, (DeepGemmExperts, TritonOrDeepGemmExperts)
+    )


 FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set()

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -20,6 +20,7 @@ from typing import (
 )

 import numpy as np
+from PIL.Image import Image
 from typing_extensions import NotRequired, TypeVar

 from vllm.utils.collection_utils import full_groupby, is_list_of
@@ -29,7 +30,6 @@ from vllm.utils.jsontree import json_map_leaves
 if TYPE_CHECKING:
    import torch
    import torch.types
-    from PIL.Image import Image
    from transformers.feature_extraction_utils import BatchFeature

    from .media import MediaWithBytes
@@ -105,6 +105,28 @@ The number of data items allowed per modality is restricted by
 """


+class VisionChunkImage(TypedDict):
+    """Represents an image wrapped as a vision chunk."""
+
+    # Tag identifying this variant; always the literal "image".
+    type: Literal["image"]
+    # The image payload, a `PIL.Image.Image` instance.
+    image: Image
+    # Identifier string for this item, or None.
+    # NOTE(review): how the UUID is produced/consumed is not visible here.
+    uuid: str | None
+
+
+class VisionChunkVideo(TypedDict):
+    """Represents a video chunk with metadata."""
+
+    # Tag identifying this variant; always the literal "video_chunk".
+    type: Literal["video_chunk"]
+    # The images making up this chunk, each a `PIL.Image.Image`.
+    video_chunk: list[Image]
+    # Identifier string for this item, or None.
+    # NOTE(review): how the UUID is produced/consumed is not visible here.
+    uuid: str | None
+    # Text associated with this chunk.
+    # NOTE(review): presumably per-chunk prompt text — confirm with producer.
+    prompt: str
+    # NOTE(review): presumably the index of the source video this chunk
+    # belongs to — confirm with producer.
+    video_idx: int
+
+
+# Union of the two vision-chunk TypedDicts; their `type` literal
+# ("image" vs "video_chunk") tells the variants apart.
+VisionChunk = VisionChunkImage | VisionChunkVideo
+"""A vision chunk is either an image or a video chunk."""
+
+
 @final
 class MultiModalDataBuiltins(TypedDict, total=False):
    """Type annotations for modality types predefined by vLLM."""
@@ -118,6 +140,9 @@ class MultiModalDataBuiltins(TypedDict, total=False):
    audio: ModalityData[AudioItem]
    """The input audio(s)."""

+    vision_chunk: ModalityData[VisionChunk]
+    """The input visual atom(s) - unified modality for images and video chunks."""
+

 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """

--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -384,6 +384,13 @@ class VideoEmbeddingItems(EmbeddingItems):
        super().__init__(data, "video", expected_hidden_size)


+class VisionChunkProcessorItems(ProcessorBatchItems[Any]):
+    """Processor items for vision chunks (unified image and video chunks).
+
+    Thin subclass that always passes ``"vision_chunk"`` as the second
+    argument of the base-class constructor.
+    """
+
+    def __init__(self, data: Sequence[Any]) -> None:
+        """Initialize the base class with ``data`` and ``"vision_chunk"``."""
+        super().__init__(data, "vision_chunk")
+
+
 _D = TypeVar("_D", bound=ModalityDataItems[Any, Any])


@@ -652,11 +659,23 @@ class MultiModalDataParser:

        return VideoProcessorItems(new_videos, metadata=metadata_lst)

+    def _parse_vision_chunk_data(
+        self,
+        data: ModalityData[Any],
+    ) -> ModalityDataItems[Any, Any] | None:
+        """Turn raw ``vision_chunk`` input into processor items.
+
+        Returns:
+            ``None`` when ``data`` is ``None`` or empty, otherwise a
+            ``VisionChunkProcessorItems`` wrapping ``data``.
+
+        Raises:
+            ValueError: If ``self.is_embeddings(data)`` is true; embedding
+                input is rejected for this modality.
+        """
+        nothing_to_parse = data is None or self._is_empty(data)
+        if nothing_to_parse:
+            return None
+
+        if not self.is_embeddings(data):
+            return VisionChunkProcessorItems(data)
+
+        raise ValueError("Do not support embedding data for vision_chunk right now")
+
    def _get_subparsers(self) -> Mapping[str, ModalityDataParser]:
+        """Map each supported modality key to the method that parses it."""
        return {
            "audio": self._parse_audio_data,
            "image": self._parse_image_data,
            "video": self._parse_video_data,
+            "vision_chunk": self._parse_vision_chunk_data,
        }

    def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems:

--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -235,6 +235,27 @@ class VideoLoader:
 VIDEO_LOADER_REGISTRY = ExtensionManager()


+@VIDEO_LOADER_REGISTRY.register("identity")
+class IdentityVideoLoader(VideoLoader):
+    """IdentityVideoLoader returns raw video bytes without decoding.
+
+    This allows the model processor to handle video decoding and
+    is required for models like Kimi-K2.5 that need custom video chunk splitting.
+
+    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
+    to opencv before release if needed.
+    """
+
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        **kwargs: Any,
+    ) -> tuple[Any, Any]:
+        """Return ``data`` unchanged, paired with ``None``.
+
+        ``num_frames`` and any extra keyword arguments are accepted but
+        not used.
+        """
+        return data, None
+
+
 @VIDEO_LOADER_REGISTRY.register("opencv")
 class OpenCVVideoBackend(VideoLoader):
    def get_cv2_video_api(self):

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -599,7 +599,8 @@ class RocmPlatform(Platform):
        cls, device: torch.types.Device | None = None
    ) -> float:
        torch.cuda.reset_peak_memory_stats(device)
-        # return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(device)[0]
+        # free_mem, total_mem = torch.cuda.mem_get_info(device)
+        # return total_mem - free_mem
        return torch.cuda.max_memory_allocated(device)

    @classmethod

--- a/vllm/plugins/lora_resolvers/filesystem_resolver.py
+++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        lora_path = os.path.join(self.lora_cache_dir, lora_name)
+        maybe_lora_request = await self._get_lora_req_from_path(
+            lora_name, lora_path, base_model_name
+        )
+        return maybe_lora_request
+
+    async def _get_lora_req_from_path(
+        self, lora_name: str, lora_path: str, base_model_name: str
+    ) -> LoRARequest | None:
+        """Builds a LoraRequest pointing to the lora path if it's a valid
+        LoRA adapter and has a matching base_model_name.
+        """
        if os.path.exists(lora_path):
-            adapter_config_path = os.path.join(
-                self.lora_cache_dir, lora_name, "adapter_config.json"
-            )
+            adapter_config_path = os.path.join(lora_path, "adapter_config.json")
+
            if os.path.exists(adapter_config_path):
                with open(adapter_config_path) as file:
                    adapter_config = json.load(file)

--- a/vllm/plugins/lora_resolvers/hf_hub_resolver.py
+++ b/vllm/plugins/lora_resolvers/hf_hub_resolver.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import os
+
+from huggingface_hub import HfApi, snapshot_download
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.lora.resolver import LoRAResolverRegistry
+from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
+
+logger = init_logger(__name__)
+
+
+class HfHubResolver(FilesystemResolver):
+    """LoRA resolver that fetches adapters from an allow-list of HF Hub repos.
+
+    A LoRA name is read as ``<repo>`` or ``<repo>/<subpath>``, where
+    ``<repo>`` must be one of ``repo_list``. A matching adapter directory is
+    fetched with ``snapshot_download`` and then checked through
+    ``_get_lora_req_from_path``.
+    """
+
+    def __init__(self, repo_list: list[str]):
+        """Record the allowed repos and start with an empty adapter-dir cache.
+
+        NOTE(review): ``FilesystemResolver.__init__`` is not called here;
+        confirm that this is intentional.
+
+        Args:
+            repo_list: HF Hub repo ids that LoRA names may resolve against.
+        """
+        logger.warning(
+            "LoRA is allowing resolution from the following repositories on"
+            " HF Hub: %s please note that allowing remote downloads"
+            " is not secure, and that this plugin is not intended for use in"
+            " production environments.",
+            repo_list,
+        )
+
+        self.repo_list: list[str] = repo_list
+        # Per-repo cache of the directories that hold an adapter config.
+        self.adapter_dirs: dict[str, set[str]] = {}
+
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> LoRARequest | None:
+        """Resolves potential LoRA requests in a remote repo on HF Hub.
+        This is effectively the same behavior as the filesystem resolver, but
+        with a snapshot_download on dirs containing an adapter config prior
+        to inspecting the cached dir to build a potential LoRA
+        request.
+
+        Returns None when the name matches no allowed repo or does not point
+        at a directory containing an adapter config.
+        """
+        # If a LoRA name begins with the repository name, it's disambiguated
+        maybe_repo = await self._resolve_repo(lora_name)
+        if maybe_repo is None:
+            return None
+
+        # If we haven't inspected this repo before, save available adapter dirs
+        if maybe_repo not in self.adapter_dirs:
+            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)
+
+        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
+        if maybe_subpath is None:
+            return None
+
+        # Only fetch the adapter's own directory (or everything for the root).
+        repo_path = await asyncio.to_thread(
+            snapshot_download,
+            repo_id=maybe_repo,
+            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
+        )
+
+        lora_path = os.path.join(repo_path, maybe_subpath)
+        return await self._get_lora_req_from_path(
+            lora_name, lora_path, base_model_name
+        )
+
+    async def _resolve_repo(self, lora_name: str) -> str | None:
+        """Given a fully qualified path to a LoRA with respect to its HF Hub
+        repo, match the right repo to potentially download from if one exists.
+
+        Args:
+            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
+                match on <org>/<repo> (if it contains an adapter directly) or
+                <org>/<repo>/ if it may have one in subdirs.
+
+        Returns:
+            The first entry of ``repo_list`` that equals ``lora_name`` or is
+            followed by "/" in it, else None.
+        """
+        # Requiring an exact match or a "/" boundary stops "org/repo" from
+        # matching "org/repo-other".
+        return next(
+            (
+                repo
+                for repo in self.repo_list
+                if lora_name == repo or lora_name.startswith(repo + "/")
+            ),
+            None,
+        )
+
+    async def _resolve_repo_subpath(
+        self, lora_name: str, maybe_repo: str | None
+    ) -> str | None:
+        """Given the fully qualified path of the LoRA with respect to the HF
+        Repo, get the subpath to download from assuming it's actually got an
+        adapter in it.
+
+        ``self.adapter_dirs[maybe_repo]`` must already be populated; a
+        missing entry raises KeyError.
+
+        Args:
+            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
+            maybe_repo: Path to the repo to match against if one exists.
+
+        Returns:
+            "." for the repo root, the subpath otherwise, or None when there
+            is no repo or the directory holds no adapter config.
+        """
+        if maybe_repo is None:
+            return None
+
+        # Everything after "<repo>/" with trailing slashes removed. An empty
+        # remainder ("<repo>", "<repo>/", "<repo>//") means the repo root,
+        # which is always spelled "." so it matches the adapter-dir cache.
+        adapter_dir = lora_name[len(maybe_repo) + 1 :].rstrip("/") or "."
+
+        # Only download if the directory actually contains an adapter
+        is_adapter = adapter_dir in self.adapter_dirs[maybe_repo]
+        return adapter_dir if is_adapter else None
+
+    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
+        """Gets the subpaths within a HF repo that contain an adapter config.
+
+        Args:
+            repo_name: Name of the HF hub repo to inspect.
+
+        Returns:
+            Directory of every file named exactly ``adapter_config.json``;
+            the repo root is reported as ".".
+        """
+        repo_files = await asyncio.to_thread(HfApi().list_repo_files, repo_id=repo_name)
+        # Compare the basename so files such as "old_adapter_config.json" do
+        # not count, and map the empty dirname of a root-level file to ".".
+        return {
+            os.path.dirname(name) or "."
+            for name in repo_files
+            if os.path.basename(name) == "adapter_config.json"
+        }
+
+
+def register_hf_hub_resolver():
+    """Register the Hf hub LoRA Resolver with vLLM
+
+    Nothing happens unless ``VLLM_LORA_RESOLVER_HF_REPO_LIST`` is set. If it
+    is set but ``lora_hf_hub_resolver`` is not listed in ``VLLM_PLUGINS``, a
+    warning is logged and no resolver is registered.
+    """
+    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
+    if not hf_repo_list:
+        return
+
+    is_enabled = (
+        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
+    )
+    if not is_enabled:
+        logger.warning(
+            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
+            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
+            " enable this resolver directly in VLLM_PLUGINS to use it "
+            " because it allows remote downloads."
+        )
+        return
+
+    # Tolerate "a, b" and trailing commas: an empty repo id would otherwise
+    # match any LoRA name starting with "/".
+    stripped = (repo.strip() for repo in hf_repo_list.split(","))
+    repos = [repo for repo in stripped if repo]
+    if repos:
+        hf_hub_resolver = HfHubResolver(repos)
+        LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -54,8 +54,8 @@ _REASONING_PARSERS_TO_REGISTER = {
        "HunyuanA13BReasoningParser",
    ),
    "kimi_k2": (
-        "deepseek_r1_reasoning_parser",
-        "DeepSeekR1ReasoningParser",
+        "kimi_k2_reasoning_parser",
+        "KimiK2ReasoningParser",
    ),
    "minimax_m2": (
        "minimax_m2_reasoning_parser",

--- a/vllm/reasoning/kimi_k2_reasoning_parser.py
+++ b/vllm/reasoning/kimi_k2_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+
+from .identity_reasoning_parser import IdentityReasoningParser
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.chat_completion.protocol import (
+        ChatCompletionRequest,
+    )
+else:
+    ChatCompletionRequest = Any
+
+
+logger = init_logger(__name__)
+
+
+class KimiK2ReasoningParser(ReasoningParser):
+    """
+    Kimi K2 parser that delegates to either DeepSeekR1ReasoningParser or
+    IdentityReasoningParser based on the `thinking` chat template kwarg.
+
+    Unlike DeepSeekV3ReasoningParser which defaults to NOT thinking,
+    KimiK2ReasoningParser defaults to thinking mode (uses DeepSeekR1ReasoningParser).
+
+    Every parsing method forwards its arguments to the selected delegate.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        """Pick the delegate parser from ``chat_template_kwargs["thinking"]``.
+
+        Args:
+            tokenizer: Tokenizer handed to the base class and the delegate.
+            *args: Forwarded to the base class and the delegate.
+            **kwargs: Forwarded likewise, except that ``chat_template_kwargs``
+                is removed before constructing the delegate.
+        """
+        super().__init__(tokenizer, *args, **kwargs)
+
+        # `kwargs` is this call's own dict, so popping from it is safe.
+        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
+        # Key difference: default to True instead of False.
+        # Read with `.get`: `chat_kwargs` is the caller's dict and must not
+        # lose its "thinking" entry as a side effect of building the parser.
+        thinking = bool(chat_kwargs.get("thinking", True))
+
+        if thinking:
+            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
+        else:
+            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        """Forward to the delegate's ``is_reasoning_end``."""
+        return self._parser.is_reasoning_end(input_ids)
+
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        """Forward to the delegate's ``is_reasoning_end_streaming``."""
+        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """Forward to the delegate's ``extract_content_ids``."""
+        return self._parser.extract_content_ids(input_ids)
+
+    def extract_reasoning(
+        self, model_output: str, request: "ChatCompletionRequest"
+    ) -> tuple[str | None, str | None]:
+        """Forward to the delegate's ``extract_reasoning``."""
+        return self._parser.extract_reasoning(model_output, request)
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """Forward to the delegate's ``extract_reasoning_streaming``."""
+        return self._parser.extract_reasoning_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+        )