[Docs] Lazy import gguf (#20785)

Signed-off-by: simon-mo <simon.mo@hey.com>

[Docs] Lazy import gguf (#20785)
Signed-off-by: simon-mo <simon.mo@hey.com>
b854321f · Simon Mo · GitHub · 5b6fe23d · b854321f · b854321f
Unverified Commit b854321f authored Jul 10, 2025 by Simon Mo Committed by GitHub Jul 10, 2025
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 2 deletions

vllm/entrypoints/score_utils.py vllm/entrypoints/score_utils.py +5 -1

vllm/model_executor/model_loader/weight_utils.py vllm/model_executor/model_loader/weight_utils.py +5 -1

No files found.
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -11,7 +11,6 @@ from vllm.entrypoints.chat_utils import (
    ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam,
    MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part)
 from vllm.inputs import TokensPrompt
-from vllm.model_executor.model_loader import get_model_cls
 from vllm.model_executor.models.interfaces import supports_score_template
 from vllm.multimodal.inputs import MultiModalDataDict
 from vllm.outputs import PoolingRequestOutput
@@ -140,6 +139,8 @@ def apply_score_template(
    prompt_1: str,
    prompt_2: str,
 ) -> str:
+    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
+    from vllm.model_executor.model_loader import get_model_cls
    model = get_model_cls(model_config)
    if supports_score_template(model):
@@ -162,6 +163,9 @@ def post_process_tokens(
    Note:
        This is an in-place operation.
    """
+    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
+    from vllm.model_executor.model_loader import get_model_cls
    model = get_model_cls(model_config)
    if supports_score_template(model):
        model.post_process_tokens(prompt)

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -14,7 +14,6 @@ from pathlib import Path
 from typing import Any, Callable, Optional, Union
 import filelock
-import gguf
 import huggingface_hub.constants
 import numpy as np
 import torch
@@ -40,6 +39,11 @@ except (ImportError, OSError):
    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
        "SafetensorsStreamer")
+try:
+    import gguf
+except ImportError:
+    gguf = PlaceholderModule("gguf")
 try:
    from fastsafetensors import SafeTensorsFileLoader, SingleGroup
 except ImportError: