Unverified Commit e48de6aa authored by Krishnan Prashanth's avatar Krishnan Prashanth Committed by GitHub
Browse files

chore: upgrade sglang dependency from 0.5.9 to 0.5.10.post1 (#7997)


Signed-off-by: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent edce7d3d
...@@ -273,8 +273,8 @@ text-to-video-diffusion.sh # 1-2 GPUs - Text-to-video (Wan2.1) ...@@ -273,8 +273,8 @@ text-to-video-diffusion.sh # 1-2 GPUs - Text-to-video (Wan2.1)
- **engine=None**: Multimodal encode worker passes `engine=None` to - **engine=None**: Multimodal encode worker passes `engine=None` to
BaseWorkerHandler. Any code in the base class that touches engine must guard with BaseWorkerHandler. Any code in the base class that touches engine must guard with
`if engine is not None`. `if engine is not None`.
- **GenerationResult is a dataclass**: SGLang 0.5.9 changed `DiffGenerator.generate()` - **GenerationResult is a dataclass**: SGLang `DiffGenerator.generate()`
to return `GenerationResult` (not a dict). Use `result.frames`, not `result["frames"]`. returns `GenerationResult` (not a dict). Use `result.frames`, not `result["frames"]`.
- **output_modalities default**: Global default is `["text"]`. Image/video diffusion - **output_modalities default**: Global default is `["text"]`. Image/video diffusion
workers must override to `["image"]`/`["video"]` or the Rust registration path tries workers must override to `["image"]`/`["video"]` or the Rust registration path tries
to load `config.json` (which doesn't exist for diffusers models). to load `config.json` (which doesn't exist for diffusers models).
......
...@@ -22,24 +22,28 @@ from typing import Any ...@@ -22,24 +22,28 @@ from typing import Any
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Network utilities: NetworkAddress, get_local_ip_auto, get_zmq_socket
#
# 0.5.10+: sglang.srt.utils.network (canonical)
# 0.5.9: sglang.srt.utils (get_local_ip_auto, get_zmq_socket only;
# NetworkAddress did not exist)
# ---------------------------------------------------------------------------
try: try:
from sglang.srt.utils.network import ( # noqa: F401 from sglang.srt.utils.network import ( # noqa: F401
NetworkAddress, NetworkAddress,
get_local_ip_auto, get_local_ip_auto,
get_zmq_socket, get_zmq_socket,
) )
_SGLANG_HAS_NETWORK_MODULE = True
except ImportError: except ImportError:
# Fallback for sglang <= 0.5.9. Remove when min supported version is 0.6.0+ # Fallback for sglang 0.5.9. Remove when min supported version is 0.5.10+
from sglang.srt.utils import ( # type: ignore[no-redef] # noqa: F401 from sglang.srt.utils import ( # type: ignore[no-redef] # noqa: F401
get_local_ip_auto, get_local_ip_auto,
get_zmq_socket, get_zmq_socket,
) )
_SGLANG_HAS_NETWORK_MODULE = False
logger.info( logger.info(
"sglang.srt.utils.network not found (sglang <= 0.5.9); " "sglang.srt.utils.network not found (sglang 0.5.9); "
"using compatibility shim for NetworkAddress" "using compatibility shim for NetworkAddress"
) )
...@@ -99,6 +103,36 @@ except ImportError: ...@@ -99,6 +103,36 @@ except ImportError:
return f"tcp://{self.host}:{self.port}" return f"tcp://{self.host}:{self.port}"
# ---------------------------------------------------------------------------
# MMEncoder._encode() adapter
#
# 0.5.10+: _encode(mm_items, modality) -> (grid_dim, embedding, aux_data)
# 0.5.9: _encode(mm_items) -> (grid_dim, embedding)
#
# Imports are deferred to avoid pulling sgl_kernel (CUDA-only) at module
# level, which breaks test collection on arm64 CPU-only CI nodes.
# ---------------------------------------------------------------------------
def _explicitly_accepts_modality(encode_fn: Any) -> bool:
    """Return True only when *encode_fn* visibly takes two positional args.

    Used to tell a signature mismatch apart from a ``TypeError`` raised
    inside ``_encode``. Returns False whenever the signature is ambiguous
    (``*args``, or not introspectable, e.g. some mocks/builtins) so the
    caller keeps its permissive fallback behavior in those cases.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import section.
    import inspect

    try:
        parameters = inspect.signature(encode_fn).parameters.values()
    except (TypeError, ValueError):
        return False

    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    positional_count = 0
    for parameter in parameters:
        if parameter.kind is inspect.Parameter.VAR_POSITIONAL:
            # *args swallows anything; the signature tells us nothing.
            return False
        if parameter.kind in positional_kinds:
            positional_count += 1
    return positional_count >= 2


async def mm_encode(encoder: Any, mm_items: Any, modality: Any) -> tuple:
    """Version-safe wrapper around ``MMEncoder._encode()``.

    Always returns ``(grid_dim, embedding, aux_data)``. On sglang 0.5.9
    ``_encode`` takes no modality arg and returns a 2-tuple; on 0.5.10+ it
    takes modality and returns a 3-tuple. The new signature is tried first
    and the old one is used as a fallback.

    Args:
        encoder: Object exposing an awaitable ``_encode`` method.
        mm_items: Items forwarded unchanged to ``_encode``.
        modality: Modality forwarded to ``_encode`` when it is accepted.

    Returns:
        A 3-tuple ``(grid_dim, embedding, aux_data)``; ``aux_data`` is
        ``None`` when ``_encode`` returned only two values.

    Raises:
        TypeError: Re-raised unchanged when ``_encode`` explicitly accepts
            the modality argument, because the error then originates inside
            ``_encode`` rather than from a signature mismatch. Retrying with
            the old signature would only replace it with a misleading
            "missing argument" error.
    """
    try:
        result = await encoder._encode(mm_items, modality)
    except TypeError:
        if _explicitly_accepts_modality(encoder._encode):
            # New-style signature: this is a genuine failure, not a
            # version mismatch. Do not mask it with a second call.
            raise
        # sglang 0.5.9: _encode(mm_items) -> (grid_dim, embedding)
        result = await encoder._encode(mm_items)

    if len(result) == 2:
        # Pad the old 2-tuple so callers can always unpack three values.
        return (*result, None)
    return result
def enable_disjoint_streaming_output(server_args: Any) -> None: def enable_disjoint_streaming_output(server_args: Any) -> None:
""" """
Enable SGLang's disjoint streaming output across ServerArgs field renames. Enable SGLang's disjoint streaming output across ServerArgs field renames.
...@@ -137,5 +171,5 @@ __all__ = [ ...@@ -137,5 +171,5 @@ __all__ = [
"enable_disjoint_streaming_output", "enable_disjoint_streaming_output",
"get_local_ip_auto", "get_local_ip_auto",
"get_zmq_socket", "get_zmq_socket",
"_SGLANG_HAS_NETWORK_MODULE", "mm_encode",
] ]
...@@ -16,7 +16,6 @@ from dynamo.sglang._compat import NetworkAddress, get_local_ip_auto, get_zmq_soc ...@@ -16,7 +16,6 @@ from dynamo.sglang._compat import NetworkAddress, get_local_ip_auto, get_zmq_soc
if TYPE_CHECKING: if TYPE_CHECKING:
from prometheus_client import CollectorRegistry from prometheus_client import CollectorRegistry
from sglang.srt.managers.scheduler_metrics_mixin import KvMetrics
from dynamo.common.utils.prometheus import ( from dynamo.common.utils.prometheus import (
LLMBackendMetrics, LLMBackendMetrics,
...@@ -131,9 +130,8 @@ class DynamoSglangPublisher: ...@@ -131,9 +130,8 @@ class DynamoSglangPublisher:
while self._running: while self._running:
try: try:
# Receive KvMetrics object from SGLang scheduler via ZMQ # Receive KvMetrics object from SGLang scheduler via ZMQ
# KvMetrics class: sglang/srt/managers/scheduler_metrics_mixin.py lines 45-54 # KvMetrics class: sglang/srt/observability/scheduler_metrics_mixin.py
# Sent from: sglang/srt/managers/scheduler_metrics_mixin.py lines 482-499 (_emit_kv_metrics) kv_metrics = await self._sock.recv_pyobj()
kv_metrics: KvMetrics = await self._sock.recv_pyobj()
dp_rank = ( dp_rank = (
kv_metrics.data_parallel_rank kv_metrics.data_parallel_rank
if kv_metrics.data_parallel_rank is not None if kv_metrics.data_parallel_rank is not None
......
...@@ -12,8 +12,10 @@ import torch ...@@ -12,8 +12,10 @@ import torch
# MMEncoder chain imports compiled CUDA ops; may fail in CPU-only environments. # MMEncoder chain imports compiled CUDA ops; may fail in CPU-only environments.
try: try:
from sglang.srt.disaggregation.encode_server import MMEncoder from sglang.srt.disaggregation.encode_server import MMEncoder
from sglang.srt.managers.schedule_batch import Modality
except (ImportError, OSError): except (ImportError, OSError):
MMEncoder = None # type: ignore[assignment] MMEncoder = None # type: ignore[assignment]
Modality = None # type: ignore[assignment]
from sglang.srt.parser.conversation import chat_templates from sglang.srt.parser.conversation import chat_templates
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -24,6 +26,7 @@ from dynamo.common.memory.multimodal_embedding_cache_manager import ( ...@@ -24,6 +26,7 @@ from dynamo.common.memory.multimodal_embedding_cache_manager import (
) )
from dynamo.common.multimodal import EMBEDDING_SENDER_FACTORIES from dynamo.common.multimodal import EMBEDDING_SENDER_FACTORIES
from dynamo.common.utils import nvtx_utils as _nvtx from dynamo.common.utils import nvtx_utils as _nvtx
from dynamo.sglang._compat import mm_encode
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.protocol import ( from dynamo.sglang.protocol import (
MultiModalGroup, MultiModalGroup,
...@@ -215,7 +218,9 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s ...@@ -215,7 +218,9 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s
# SGLang's _encode outputs are already on CPU; use CPU as target for consistency # SGLang's _encode outputs are already on CPU; use CPU as target for consistency
target_device = torch.device("cpu") target_device = torch.device("cpu")
if uncached_urls: if uncached_urls:
grid_dim, new_embeddings = await self.encoder._encode(uncached_urls) grid_dim, new_embeddings, _aux = await mm_encode(
self.encoder, uncached_urls, Modality.IMAGE
)
# Verify SGLang output is on CPU as expected # Verify SGLang output is on CPU as expected
if new_embeddings.device != target_device: if new_embeddings.device != target_device:
logger.warning( logger.warning(
...@@ -335,9 +340,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s ...@@ -335,9 +340,11 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler[SglangMultimodalRequest, s
precomputed_embeddings, precomputed_embeddings,
) = await self._encode_with_cache(image_urls) ) = await self._encode_with_cache(image_urls)
else: else:
image_grid_dim, precomputed_embeddings = await self.encoder._encode( (
image_urls image_grid_dim,
) precomputed_embeddings,
_aux,
) = await mm_encode(self.encoder, image_urls, Modality.IMAGE)
image_grid_thw_list = ( image_grid_thw_list = (
image_grid_dim.tolist() image_grid_dim.tolist()
......
...@@ -42,6 +42,8 @@ async def test_encode_with_cache_partial_hit_and_reuse( ...@@ -42,6 +42,8 @@ async def test_encode_with_cache_partial_hit_and_reuse(
cache_handler: MultimodalEncodeWorkerHandler, cache_handler: MultimodalEncodeWorkerHandler,
) -> None: ) -> None:
"""Partial-hit should encode only misses and preserve URL order in output.""" """Partial-hit should encode only misses and preserve URL order in output."""
from sglang.srt.managers.schedule_batch import Modality
urls = [ urls = [
"http://example.com/a.jpg", "http://example.com/a.jpg",
"http://example.com/b.jpg", "http://example.com/b.jpg",
...@@ -60,12 +62,15 @@ async def test_encode_with_cache_partial_hit_and_reuse( ...@@ -60,12 +62,15 @@ async def test_encode_with_cache_partial_hit_and_reuse(
cache_handler.encoder._encode.return_value = ( cache_handler.encoder._encode.return_value = (
torch.tensor([[1, 2, 4], [1, 2, 2]]), torch.tensor([[1, 2, 4], [1, 2, 2]]),
encoded, encoded,
None, # aux_data (unused by cache path)
) )
grid, full_embeddings = await cache_handler._encode_with_cache(urls) grid, full_embeddings = await cache_handler._encode_with_cache(urls)
# Encoder called once for uncached URLs only # Encoder called once for uncached URLs only
cache_handler.encoder._encode.assert_awaited_once_with([urls[0], urls[2]]) cache_handler.encoder._encode.assert_awaited_once_with(
[urls[0], urls[2]], Modality.IMAGE
)
# Order should match original URL order: a(8), b(4 cached), c(4) # Order should match original URL order: a(8), b(4 cached), c(4)
assert grid.tolist() == [[1, 2, 4], [1, 2, 2], [1, 2, 2]] assert grid.tolist() == [[1, 2, 4], [1, 2, 2], [1, 2, 2]]
......
...@@ -128,8 +128,8 @@ python container/compliance/process_results.py \ ...@@ -128,8 +128,8 @@ python container/compliance/process_results.py \
|-----------|------|------------| |-----------|------|------------|
| `vllm` | 12.9 | `nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04` | | `vllm` | 12.9 | `nvcr.io/nvidia/cuda:12.9.1-runtime-ubuntu24.04` |
| `vllm` | 13.0 | `nvcr.io/nvidia/cuda:13.0.2-runtime-ubuntu24.04` | | `vllm` | 13.0 | `nvcr.io/nvidia/cuda:13.0.2-runtime-ubuntu24.04` |
| `sglang` | 12.9 | `lmsysorg/sglang:v0.5.9-runtime` | | `sglang` | 12.9 | `lmsysorg/sglang:v0.5.10.post1-runtime` |
| `sglang` | 13.0 | `lmsysorg/sglang:v0.5.9-cu130-runtime` | | `sglang` | 13.0 | `lmsysorg/sglang:v0.5.10.post1-cu130-runtime` |
| `trtllm` | 13.1 | `nvcr.io/nvidia/cuda-dl-base:25.12-cuda13.1-runtime-ubuntu24.04` | | `trtllm` | 13.1 | `nvcr.io/nvidia/cuda-dl-base:25.12-cuda13.1-runtime-ubuntu24.04` |
| `dynamo` frontend | — | `nvcr.io/nvidia/base/ubuntu:noble-20250619` | | `dynamo` frontend | — | `nvcr.io/nvidia/base/ubuntu:noble-20250619` |
......
...@@ -79,14 +79,14 @@ sglang: ...@@ -79,14 +79,14 @@ sglang:
base_image: nvcr.io/nvidia/cuda-dl-base base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang runtime_image: lmsysorg/sglang
base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04 base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
runtime_image_tag: v0.5.9-runtime runtime_image_tag: v0.5.10.post1-runtime
cuda13.0: cuda13.0:
base_image: nvcr.io/nvidia/cuda-dl-base base_image: nvcr.io/nvidia/cuda-dl-base
runtime_image: lmsysorg/sglang runtime_image: lmsysorg/sglang
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: v0.5.9-cu130-runtime runtime_image_tag: v0.5.10.post1-cu130-runtime
nixl_ref: 0.10.0 nixl_ref: 0.10.0
enable_media_ffmpeg: "false" enable_media_ffmpeg: "true"
enable_gpu_memory_service: "true" enable_gpu_memory_service: "true"
enable_kvbm: "false" enable_kvbm: "false"
......
...@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna ...@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna
| **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** | | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
| :--- | :--- | :--- | :--- | :--- | | :--- | :--- | :--- | :--- | :--- |
| **main (ToT)** | `0.5.9` | `1.3.0rc11` | `0.19.0` | `0.10.1` | | **main (ToT)** | `0.5.10.post1` | `1.3.0rc11` | `0.19.0` | `0.10.1` |
| **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` | | **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
| **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` | | **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
| **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` | | **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
......
...@@ -62,7 +62,7 @@ vllm = [ ...@@ -62,7 +62,7 @@ vllm = [
sglang = [ sglang = [
"uvloop", "uvloop",
"sglang[diffusion]==0.5.9", "sglang[diffusion]==0.5.10.post1",
"nixl[cu12]<=0.10.1", "nixl[cu12]<=0.10.1",
"cupy-cuda12x>=13.0.0", "cupy-cuda12x>=13.0.0",
] ]
...@@ -181,6 +181,11 @@ filterwarnings = [ ...@@ -181,6 +181,11 @@ filterwarnings = [
"error", "error",
# CUDA deprecation warnings from tensorrt_llm # CUDA deprecation warnings from tensorrt_llm
"ignore:.*cuda*:DeprecationWarning", "ignore:.*cuda*:DeprecationWarning",
# cuda.cudart/cuda.nvrtc deprecated in favor of cuda.bindings.* (cuda-python >=13)
# Triggered by flashinfer 0.6.7+ during import of comm/mnnvl.py
"ignore:The cuda\\..*module is deprecated:FutureWarning",
# SGLang GGUF quantization emits UserWarning on non-CUDA platforms (arm64 CPU-only CI)
"ignore:Only CUDA.*support GGUF quantization:UserWarning",
# protobuf C extension warning # protobuf C extension warning
"ignore:.*PyType_Spec.*custom tp_new.*:DeprecationWarning", "ignore:.*PyType_Spec.*custom tp_new.*:DeprecationWarning",
# unclosed socket/event loop warnings # unclosed socket/event loop warnings
......
...@@ -146,9 +146,13 @@ class SGLangProcess(ManagedEngineProcessMixin): ...@@ -146,9 +146,13 @@ class SGLangProcess(ManagedEngineProcessMixin):
str(page_size), str(page_size),
] ]
# Disable CUDA graphs for faster startup & lower memory # Disable CUDA graphs for faster startup & lower memory.
# sglang 0.5.10+ has piecewise CUDA graphs (separate flag) that
# consume ~7 GB during capture — must also be disabled for
# multi-worker same-GPU tests to avoid OOM.
if disable_cuda_graph: if disable_cuda_graph:
command.append("--disable-cuda-graph") command.append("--disable-cuda-graph")
command.append("--disable-piecewise-cuda-graph")
# Limit VRAM allocation (required for multi-worker on same GPU) # Limit VRAM allocation (required for multi-worker on same GPU)
if mem_fraction_static is not None: if mem_fraction_static is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment