refactor(vllm): scope ray import to scale_elastic_ep, remove module-level ray state (#7618)

Signed-off-by: tzulingk <tzulingk@nvidia.com>

refactor(vllm): scope ray import to scale_elastic_ep, remove module-level ray state (#7618)
Signed-off-by: tzulingk <tzulingk@nvidia.com>
4b8826b3 · Tzu-Ling Kan · GitHub · 7edb07b5 · 4b8826b3
Unverified Commit 4b8826b3 authored Mar 25, 2026 by Tzu-Ling Kan Committed by GitHub Mar 25, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 52 deletions

components/src/dynamo/vllm/handlers.py components/src/dynamo/vllm/handlers.py +34 -52

No files found.
--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -13,16 +13,7 @@ import time
 from abc import ABC, abstractmethod
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
-from typing import (
+from typing import Any, AsyncIterator, Dict, Final, Generic, Optional, TypeVar
-    TYPE_CHECKING,
-    Any,
-    AsyncIterator,
-    Dict,
-    Final,
-    Generic,
-    Optional,
-    TypeVar,
-)
 import torch
 from vllm.config import VllmConfig
@@ -65,48 +56,6 @@ from .multimodal_utils.hash_utils import compute_mm_uuids_from_images
 from .multimodal_utils.model import construct_qwen_decode_mm_data, is_qwen_vl_model
 from .multimodal_utils.prefill_worker_utils import MultiModalEmbeddingLoader
-if TYPE_CHECKING:
-    import ray
-    import ray.util.state as _ray_util_state
-try:
-    import ray
-    import ray.util.state as _ray_util_state
-except ModuleNotFoundError:
-    ray = None
-    _ray_util_state = None
-# TODO(upstream-vllm): remove this patch once vLLM fixes add_dp_placement_groups in
-# vllm/v1/engine/utils.py to use ray.nodes() instead of ray.util.state.list_nodes().
-#
-# Patch ray.util.state.list_nodes to use the GCS API instead of the dashboard HTTP
-# API (127.0.0.1:8265/api/v0/nodes). The dynamo image installs ray core only (not
-# ray[default]), so the dashboard HTTP server starts in --minimal mode with the HTTP
-# server disabled. vLLM's add_dp_placement_groups calls list_nodes() which requires
-# that HTTP endpoint, causing scale_elastic_ep to fail with "Failed to connect to
-# API server".
-#
-# ray.nodes() uses the GCS gRPC channel directly (no dashboard process needed) and
-# returns the same information. This patch makes elastic EP scaling self-contained.
-#
-# Format mapping:
-#   list_nodes() → objects with .node_ip and .node_id
-#   ray.nodes()  → dicts with "NodeManagerAddress" and "NodeID"
-class _NodeInfo:
-    __slots__ = ("node_ip", "node_id")
-    def __init__(self, d: dict) -> None:
-        self.node_ip: str = d["NodeManagerAddress"]
-        self.node_id: str = d["NodeID"]
-if ray is not None and _ray_util_state is not None:
-    _ray_util_state.list_nodes = lambda **kw: [
-        _NodeInfo(n) for n in ray.nodes() if n.get("Alive", False)
-    ]
 # Multimodal data dictionary keys
 IMAGE_URL_KEY: Final = "image_url"
 VIDEO_URL_KEY: Final = "video_url"
@@ -576,6 +525,39 @@ class BaseWorkerHandler(ABC, Generic[RequestT, ResponseT]):
        logger.info(f"[ElasticEP] Scaling to new_data_parallel_size={new_dp_size}")
        try:
+            # TODO(upstream-vllm): remove this patch once vLLM fixes
+            # add_dp_placement_groups in vllm/v1/engine/utils.py to use ray.nodes()
+            # instead of ray.util.state.list_nodes().
+            #
+            # Patch ray.util.state.list_nodes to use the GCS API instead of the
+            # dashboard HTTP API (127.0.0.1:8265/api/v0/nodes). The dynamo image
+            # installs ray core only (not ray[default]), so the dashboard HTTP server
+            # starts in --minimal mode with the HTTP server disabled. vLLM's
+            # add_dp_placement_groups calls list_nodes() which requires that HTTP
+            # endpoint, causing scale_elastic_ep to fail with "Failed to connect to
+            # API server".
+            #
+            # ray.nodes() uses the GCS gRPC channel directly (no dashboard process
+            # needed) and returns the same node data. ray is imported lazily here so it
+            # is not required at module load time (absent in non-elastic-EP deployments).
+            #
+            # Format mapping:
+            #   list_nodes() → objects with .node_ip and .node_id
+            #   ray.nodes()  → dicts with "NodeManagerAddress" and "NodeID"
+            import ray
+            import ray.util.state as _ray_util_state
+            class _NodeInfo:
+                __slots__ = ("node_id", "node_ip")
+                def __init__(self, d: dict) -> None:
+                    self.node_ip: str = d["NodeManagerAddress"]
+                    self.node_id: str = d["NodeID"]
+            _ray_util_state.list_nodes = lambda **kw: [
+                _NodeInfo(n) for n in ray.nodes() if n.get("Alive", False)
+            ]
            await self.engine_client.scale_elastic_ep(new_dp_size)
            logger.info(f"[ElasticEP] Scaling to dp={new_dp_size} complete")
            return {