chore: upgrade sglang dependency from 0.5.9 to 0.5.10.post1 (#7997)

Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

chore: upgrade sglang dependency from 0.5.9 to 0.5.10.post1 (#7997)
Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
e48de6aa · Krishnan Prashanth · GitHub · edce7d3d · e48de6aa · e48de6aa
Unverified Commit e48de6aa authored Apr 13, 2026 by Krishnan Prashanth Committed by GitHub Apr 13, 2026
10 changed files
--- a/components/src/dynamo/sglang/CLAUDE.md
+++ b/components/src/dynamo/sglang/CLAUDE.md
@@ -273,8 +273,8 @@ text-to-video-diffusion.sh  # 1-2 GPUs - Text-to-video (Wan2.1)
 - **engine=None**: Multimodal encode worker passes `engine=None` to
  BaseWorkerHandler. Any code in the base class that touches engine must guard with
  `if engine is not None`.
- **GenerationResult is a dataclass**: SGLang 0.5.9 changed `DiffGenerator.generate()`
+- **GenerationResult is a dataclass**: SGLang `DiffGenerator.generate()`
-  to return `GenerationResult` (not a dict). Use `result.frames`, not `result["frames"]`.
+  returns `GenerationResult` (not a dict). Use `result.frames`, not `result["frames"]`.
 - **output_modalities default**: Global default is `["text"]`. Image/video diffusion
  workers must override to `["image"]`/`["video"]` or the Rust registration path tries
  to load `config.json` (which doesn't exist for diffusers models).

--- a/components/src/dynamo/sglang/_compat.py
+++ b/components/src/dynamo/sglang/_compat.py
@@ -22,24 +22,28 @@ from typing import Any
 logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Network utilities: NetworkAddress, get_local_ip_auto, get_zmq_socket
+#
+# 0.5.10+: sglang.srt.utils.network (canonical)
+# 0.5.9:   sglang.srt.utils (get_local_ip_auto, get_zmq_socket only;
+#           NetworkAddress did not exist)
+# ---------------------------------------------------------------------------
 try:
    from sglang.srt.utils.network import (  # noqa: F401
        NetworkAddress,
        get_local_ip_auto,
        get_zmq_socket,
    )
-    _SGLANG_HAS_NETWORK_MODULE = True
 except ImportError:
-    # Fallback for sglang <= 0.5.9. Remove when min supported version is 0.6.0+
+    # Fallback for sglang 0.5.9. Remove when min supported version is 0.5.10+
    from sglang.srt.utils import (  # type: ignore[no-redef]  # noqa: F401
        get_local_ip_auto,
        get_zmq_socket,
    )
-    _SGLANG_HAS_NETWORK_MODULE = False
    logger.info(
-        "sglang.srt.utils.network not found (sglang <= 0.5.9); "
+        "sglang.srt.utils.network not found (sglang 0.5.9); "
        "using compatibility shim for NetworkAddress"
    )
@@ -99,6 +103,36 @@ except ImportError:
            return f"tcp://{self.host}:{self.port}"
+# ---------------------------------------------------------------------------
+# MMEncoder._encode() adapter
+#
+# 0.5.10+: _encode(mm_items, modality) -> (grid_dim, embedding, aux_data)
+# 0.5.9:   _encode(mm_items)           -> (grid_dim, embedding)
+#
+# Imports are deferred to avoid pulling sgl_kernel (CUDA-only) at module
+# level, which breaks test collection on arm64 CPU-only CI nodes.
+# ---------------------------------------------------------------------------
+async def mm_encode(encoder: Any, mm_items: Any, modality: Any) -> tuple:
+    """Call ``encoder._encode`` with whichever signature the encoder supports.
+
+    sglang 0.5.10+ exposes ``_encode(mm_items, modality)`` and returns
+    ``(grid_dim, embedding, aux_data)``; sglang 0.5.9 exposes
+    ``_encode(mm_items)`` and returns ``(grid_dim, embedding)``.
+
+    The calling convention is chosen by inspecting the signature up front
+    rather than by catching ``TypeError`` from the awaited call. That way a
+    ``TypeError`` raised *inside* the encoder propagates unchanged instead of
+    being mistaken for an arity mismatch and triggering a second encode.
+
+    Args:
+        encoder: Object exposing an awaitable ``_encode`` method.
+        mm_items: Items to encode; forwarded to ``_encode`` untouched.
+        modality: Modality selector; forwarded only when ``_encode`` accepts it.
+
+    Returns:
+        A 3-tuple ``(grid_dim, embedding, aux_data)``. ``aux_data`` is ``None``
+        when the encoder returned a 2-tuple.
+    """
+    # Function-scope import: keeps the module's top-level imports untouched.
+    import inspect
+
+    encode = encoder._encode
+    try:
+        signature = inspect.signature(encode)
+    except (TypeError, ValueError):
+        # Signature not introspectable; assume the 0.5.10+ convention.
+        signature = None
+
+    accepts_modality = True
+    if signature is not None:
+        try:
+            signature.bind(mm_items, modality)
+        except TypeError:
+            # sglang 0.5.9: _encode(mm_items) -> (grid_dim, embedding)
+            accepts_modality = False
+
+    if accepts_modality:
+        result = await encode(mm_items, modality)
+    else:
+        result = await encode(mm_items)
+
+    # Normalize the 0.5.9 2-tuple to the 3-tuple shape callers unpack.
+    if len(result) == 2:
+        return (*result, None)
+    return result
 def enable_disjoint_streaming_output(server_args: Any) -> None:
    """
    Enable SGLang's disjoint streaming output across ServerArgs field renames.
@@ -137,5 +171,5 @@ __all__ = [
    "enable_disjoint_streaming_output",
    "get_local_ip_auto",
    "get_zmq_socket",
-    "_SGLANG_HAS_NETWORK_MODULE",
+    "mm_encode",
 ]
--- a/components/src/dynamo/sglang/publisher.py
+++ b/components/src/dynamo/sglang/publisher.py
@@ -16,7 +16,6 @@ from dynamo.sglang._compat import NetworkAddress, get_local_ip_auto, get_zmq_soc
 if TYPE_CHECKING:
    from prometheus_client import CollectorRegistry
-    from sglang.srt.managers.scheduler_metrics_mixin import KvMetrics
 from dynamo.common.utils.prometheus import (
    LLMBackendMetrics,
@@ -131,9 +130,8 @@ class DynamoSglangPublisher:
        while self._running:
            try:
                # Receive KvMetrics object from SGLang scheduler via ZMQ
-                # KvMetrics class: sglang/srt/managers/scheduler_metrics_mixin.py lines 45-54
+                # KvMetrics class: sglang/srt/observability/scheduler_metrics_mixin.py
-                # Sent from: sglang/srt/managers/scheduler_metrics_mixin.py lines 482-499 (_emit_kv_metrics)
+                kv_metrics = await self._sock.recv_pyobj()
-                kv_metrics: KvMetrics = await self._sock.recv_pyobj()
                dp_rank = (
                    kv_metrics.data_parallel_rank
                    if kv_metrics.data_parallel_rank is not None

--- a/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
@@ -12,8 +12,10 @@ import torch
 # MMEncoder chain imports compiled CUDA ops; may fail in CPU-only environments.
 try:
    from sglang.srt.disaggregation.encode_server import MMEncoder
+    from sglang.srt.managers.schedule_batch import Modality
 except (ImportError, OSError):
    MMEncoder = None  # type: ignore[assignment]
+    Modality = None  # type: ignore[assignment]
 from sglang.srt.parser.conversation import chat_templates
 from transformers import AutoTokenizer
@@ -24,6 +26,7 @@ from dynamo.common.memory.multimodal_embedding_cache_manager import (
 )
 from dynamo.common.multimodal import EMBEDDING_SENDER_FACTORIES
 from dynamo.common.utils import nvtx_utils as _nvtx
+from dynamo.sglang._compat import mm_encode
 from dynamo.sglang.args import Config
 from dynamo.sglang.protocol import (
    MultiModalGroup,
@@ -215,7 +218,9 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s
        # SGLang's _encode outputs are already on CPU; use CPU as target for consistency
        target_device = torch.device("cpu")
        if uncached_urls:
-            grid_dim, new_embeddings = await self.encoder._encode(uncached_urls)
+            grid_dim, new_embeddings, _aux = await mm_encode(
+                self.encoder, uncached_urls, Modality.IMAGE
+            )
            # Verify SGLang output is on CPU as expected
            if new_embeddings.device != target_device:
                logger.warning(
@@ -335,9 +340,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s
                        precomputed_embeddings,
                    ) = await self._encode_with_cache(image_urls)
                else:
-                    image_grid_dim, precomputed_embeddings = await self.encoder._encode(
+                    (
-                        image_urls
+                        image_grid_dim,
-                    )
+                        precomputed_embeddings,
+                        _aux,
+                    ) = await mm_encode(self.encoder, image_urls, Modality.IMAGE)
            image_grid_thw_list = (
                image_grid_dim.tolist()

--- a/components/src/dynamo/sglang/tests/test_sglang_multimodal_embedding_cache.py
+++ b/components/src/dynamo/sglang/tests/test_sglang_multimodal_embedding_cache.py
@@ -42,6 +42,8 @@ async def test_encode_with_cache_partial_hit_and_reuse(
    cache_handler: MultimodalEncodeWorkerHandler,
 ) -> None:
    """Partial-hit should encode only misses and preserve URL order in output."""
+    from sglang.srt.managers.schedule_batch import Modality
    urls = [
        "http://example.com/a.jpg",
        "http://example.com/b.jpg",
@@ -60,12 +62,15 @@ async def test_encode_with_cache_partial_hit_and_reuse(
    cache_handler.encoder._encode.return_value = (
        torch.tensor([[1, 2, 4], [1, 2, 2]]),
        encoded,
+        None,  # aux_data (unused by cache path)
    )
    grid, full_embeddings = await cache_handler._encode_with_cache(urls)
    # Encoder called once for uncached URLs only
-    cache_handler.encoder._encode.assert_awaited_once_with([urls[0], urls[2]])
+    cache_handler.encoder._encode.assert_awaited_once_with(
+        [urls[0], urls[2]], Modality.IMAGE
+    )
    # Order should match original URL order: a(8), b(4 cached), c(4)
    assert grid.tolist() == [[1, 2, 4], [1, 2, 2], [1, 2, 2]]

--- a/container/compliance/README.md
+++ b/container/compliance/README.md
@@ -128,8 +128,8 @@ python container/compliance/process_results.py \
 |-----------|------|------------|
 | `vllm` | 12.9 | `nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04` |
 | `vllm` | 13.0 | `nvcr.io/nvidia/cuda:13.0.2-runtime-ubuntu24.04` |
-| `sglang` | 12.9 | `lmsysorg/sglang:v0.5.9-runtime` |
+| `sglang` | 12.9 | `lmsysorg/sglang:v0.5.10.post1-runtime` |
-| `sglang` | 13.0 | `lmsysorg/sglang:v0.5.9-cu130-runtime` |
+| `sglang` | 13.0 | `lmsysorg/sglang:v0.5.10.post1-cu130-runtime` |
 | `trtllm` | 13.1 | `nvcr.io/nvidia/cuda-dl-base:25.12-cuda13.1-runtime-ubuntu24.04` |
 | `dynamo` frontend | — | `nvcr.io/nvidia/base/ubuntu:noble-20250619` |

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -79,14 +79,14 @@ sglang:
    base_image: nvcr.io/nvidia/cuda-dl-base
    runtime_image: lmsysorg/sglang
    base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
-    runtime_image_tag: v0.5.9-runtime
+    runtime_image_tag: v0.5.10.post1-runtime
  cuda13.0:
    base_image: nvcr.io/nvidia/cuda-dl-base
    runtime_image: lmsysorg/sglang
    base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
-    runtime_image_tag: v0.5.9-cu130-runtime
+    runtime_image_tag: v0.5.10.post1-cu130-runtime
  nixl_ref: 0.10.0
-  enable_media_ffmpeg: "false"
+  enable_media_ffmpeg: "true"
  enable_gpu_memory_service: "true"
  enable_kvbm: "false"

--- a/docs/reference/support-matrix.md
+++ b/docs/reference/support-matrix.md
@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna
 | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
 | :--- | :--- | :--- | :--- | :--- |
-| **main (ToT)** | `0.5.9` | `1.3.0rc11` | `0.19.0` | `0.10.1` |
+| **main (ToT)** | `0.5.10.post1` | `1.3.0rc11` | `0.19.0` | `0.10.1` |
 | **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
 | **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
 | **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ vllm = [
 sglang = [
    "uvloop",
-    "sglang[diffusion]==0.5.9",
+    "sglang[diffusion]==0.5.10.post1",
    "nixl[cu12]<=0.10.1",
    "cupy-cuda12x>=13.0.0",
 ]
@@ -181,6 +181,11 @@ filterwarnings = [
    "error",
    # CUDA deprecation warnings from tensorrt_llm
    "ignore:.*cuda*:DeprecationWarning",
+    # cuda.cudart/cuda.nvrtc deprecated in favor of cuda.bindings.* (cuda-python >=13)
+    # Triggered by flashinfer 0.6.7+ during import of comm/mnnvl.py
+    "ignore:The cuda\\..*module is deprecated:FutureWarning",
+    # SGLang GGUF quantization emits UserWarning on non-CUDA platforms (arm64 CPU-only CI)
+    "ignore:Only CUDA.*support GGUF quantization:UserWarning",
    # protobuf C extension warning
    "ignore:.*PyType_Spec.*custom tp_new.*:DeprecationWarning",
    # unclosed socket/event loop warnings

--- a/tests/router/test_router_e2e_with_sglang.py
+++ b/tests/router/test_router_e2e_with_sglang.py
@@ -146,9 +146,13 @@ class SGLangProcess(ManagedEngineProcessMixin):
                str(page_size),
            ]
-            # Disable CUDA graphs for faster startup & lower memory
+            # Disable CUDA graphs for faster startup & lower memory.
+            # sglang 0.5.10+ has piecewise CUDA graphs (separate flag) that
+            # consume ~7 GB during capture — must also be disabled for
+            # multi-worker same-GPU tests to avoid OOM.
            if disable_cuda_graph:
                command.append("--disable-cuda-graph")
+                command.append("--disable-piecewise-cuda-graph")
            # Limit VRAM allocation (required for multi-worker on same GPU)
            if mem_fraction_static is not None: