vlm: remove redundant d2h movement of mm feature tensors (#9987)

Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>

vlm: remove redundant d2h movement of mm feature tensors (#9987)
Co-authored-by: Xiang (Kevin) Li <lik@nvidia.com>
de28f8e7 · Kevin Xiang Li · GitHub · 56405076 · de28f8e7 · de28f8e7
Unverified commit de28f8e7, authored Sep 17, 2025 by Kevin Xiang Li; committed by GitHub on Sep 17, 2025.
Showing 2 changed files with 13 additions and 6 deletions

python/sglang/srt/multimodal/processors/base_processor.py +7 -6

python/sglang/srt/server_args.py +6 -0

No files found.
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -241,12 +241,13 @@ class BaseMultimodalProcessor(ABC):
            return_tensors="pt",
            **kwargs,
        )
-        # move feature tensors to cpu
-        for feature_name in self.FEATURE_NAMES:
-            if feature_name in result and isinstance(
-                result[feature_name], torch.Tensor
-            ):
-                result[feature_name] = result[feature_name].to("cpu")
+        if not self.server_args.keep_mm_feature_on_device:
+            # move feature tensors to cpu
+            for feature_name in self.FEATURE_NAMES:
+                if feature_name in result and isinstance(
+                    result[feature_name], torch.Tensor
+                ):
+                    result[feature_name] = result[feature_name].to("cpu")

        return result


--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -381,6 +381,7 @@ class ServerArgs:
    disable_shared_experts_fusion: bool = False
    disable_chunked_prefix_cache: bool = False
    disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
    enable_return_hidden_states: bool = False
    scheduler_recv_interval: int = 1
    numa_node: Optional[List[int]] = None
@@ -2213,6 +2214,11 @@ class ServerArgs:
            action="store_true",
            help="Adopt base image processor instead of fast image processor.",
        )
+        parser.add_argument(
+            "--keep-mm-feature-on-device",
+            action="store_true",
+            help="Keep multimodal feature tensors on device after processing to save D2H copy.",
+        )
        parser.add_argument(
            "--enable-return-hidden-states",
            action="store_true",