Unverified Commit 2d0dab74 authored by Indrajit Bhosale's avatar Indrajit Bhosale Committed by GitHub
Browse files

feat: Sglang Request cancellation (#3465)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 6e8529fd
...@@ -108,7 +108,7 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -108,7 +108,7 @@ async def init(runtime: DistributedRuntime, config: Config):
try: try:
# Start endpoint immediately and register model concurrently # Start endpoint immediately and register model concurrently
# Requests queue until ready_event is set # Requests queue until ready_event is set (TODO: Part of new PR)
await asyncio.gather( await asyncio.gather(
generate_endpoint.serve_endpoint( generate_endpoint.serve_endpoint(
handler.generate, handler.generate,
......
...@@ -6,7 +6,7 @@ from typing import Optional ...@@ -6,7 +6,7 @@ from typing import Optional
import sglang as sgl import sglang as sgl
from dynamo._core import Component from dynamo._core import Component, Context
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.protocol import EmbeddingRequest from dynamo.sglang.protocol import EmbeddingRequest
from dynamo.sglang.publisher import DynamoSglangPublisher from dynamo.sglang.publisher import DynamoSglangPublisher
...@@ -29,7 +29,14 @@ class EmbeddingWorkerHandler(BaseWorkerHandler): ...@@ -29,7 +29,14 @@ class EmbeddingWorkerHandler(BaseWorkerHandler):
logging.info("Engine shutdown") logging.info("Engine shutdown")
super().cleanup() super().cleanup()
async def generate(self, request: dict): async def generate(self, request: dict, context: Context):
"""
Generate embeddings for the given input.
Args:
request: Embedding request dictionary.
context: Context object for cancellation handling.
"""
logging.debug(f"Embedding request: {request}") logging.debug(f"Embedding request: {request}")
# Parse the embedding request - should only receive EmbeddingRequest format # Parse the embedding request - should only receive EmbeddingRequest format
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio
import logging
import random import random
import socket import socket
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, Tuple from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Dict, Optional, Tuple
import sglang as sgl import sglang as sgl
from sglang.srt.utils import get_local_ip_auto from sglang.srt.utils import get_local_ip_auto
from dynamo._core import Client, Component from dynamo._core import Client, Component, Context
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.publisher import DynamoSglangPublisher from dynamo.sglang.publisher import DynamoSglangPublisher
...@@ -48,11 +51,12 @@ class BaseWorkerHandler(ABC): ...@@ -48,11 +51,12 @@ class BaseWorkerHandler(ABC):
self.skip_tokenizer_init = config.server_args.skip_tokenizer_init self.skip_tokenizer_init = config.server_args.skip_tokenizer_init
@abstractmethod @abstractmethod
async def generate(self, request: Dict[str, Any]): async def generate(self, request: Dict[str, Any], context: Context):
"""Generate response from request. """Generate response from request.
Args: Args:
request: Request dict with input and parameters. request: Request dict with input and parameters.
context: Context object for cancellation handling.
Yields: Yields:
Response data (format varies by handler implementation). Response data (format varies by handler implementation).
...@@ -112,3 +116,107 @@ class BaseWorkerHandler(ABC): ...@@ -112,3 +116,107 @@ class BaseWorkerHandler(ABC):
bootstrap_host = get_local_ip_auto() bootstrap_host = get_local_ip_auto()
return bootstrap_host, bootstrap_port return bootstrap_host, bootstrap_port
async def _handle_cancellation(
    self, request_id_future: asyncio.Future, context: Context
):
    """Background task that aborts the SGLang request when the context is cancelled.

    The task runs in three steps: it waits for the SGLang request ID to be
    published through ``request_id_future``, then waits for
    ``context.async_killed_or_stopped()``, and finally calls
    ``abort_request`` on the engine's ``tokenizer_manager``.

    Args:
        request_id_future: Future that will be set with the SGLang request ID
            when the first response arrives.
        context: Context object for cancellation handling.

    Raises:
        asyncio.CancelledError: Re-raised after logging when this task is
            cancelled (expected once generation completes).
    """
    try:
        logging.debug(f"Cancellation monitor started for Context: {context.id()}")

        # Always wait for the request ID to ensure we can abort the request
        sglang_request_id = await request_id_future
        logging.debug(
            f"Cancellation monitor received SGLang Request ID {sglang_request_id} for Context: {context.id()}"
        )

        await context.async_killed_or_stopped()
        logging.info(
            f"Cancellation signal received for SGLang Request ID {sglang_request_id}, Context: {context.id()}"
        )

        # Call abort_request on the tokenizer_manager through the engine
        tokenizer_manager = getattr(self.engine, "tokenizer_manager", None)
        if tokenizer_manager:
            logging.info(
                f"Calling SGLang abort_request for Request ID {sglang_request_id}"
            )
            tokenizer_manager.abort_request(rid=sglang_request_id, abort_all=False)
            # NOTE: this message carries the context ID (not the SGLang ID);
            # the cancellation e2e tests poll the worker log for it.
            logging.info(f"Aborted Request ID: {context.id()}")
        else:
            logging.error(
                f"SGLang tokenizer_manager not found for abort request: {context.id()}"
            )
    except asyncio.CancelledError:
        # Task was cancelled, which is expected when generation completes
        request_id = "unknown"
        if (
            request_id_future.done()
            and not request_id_future.cancelled()
            and request_id_future.exception() is None
        ):
            request_id = request_id_future.result()
        logging.debug(
            f"Cancellation monitor task cancelled for SGLang Request ID {request_id}, Context: {context.id()}"
        )
        raise
@asynccontextmanager
async def _cancellation_monitor(
    self, request_id_future: asyncio.Future, context: Context
) -> AsyncGenerator[asyncio.Task, None]:
    """Context manager for monitoring request cancellation.

    Creates a background task running ``self._handle_cancellation`` on entry
    and cleans it up on exit: a still-running task is cancelled and awaited;
    a task that already finished has its outcome inspected so that a failure
    inside the monitor is logged instead of being silently dropped.

    Args:
        request_id_future: Future that will be set with the SGLang request ID
            when the first response arrives.
        context: Context object for cancellation handling.

    Yields:
        asyncio.Task: The cancellation monitoring task being managed.
    """
    logging.info(f"Creating cancellation monitor task for Context: {context.id()}")

    # Start the cancellation monitoring task
    cancellation_task = asyncio.create_task(
        self._handle_cancellation(request_id_future, context)
    )

    try:
        yield cancellation_task
    finally:
        # Resolve the SGLang request ID for log messages, if it was ever set
        request_id = "unknown"
        if (
            request_id_future.done()
            and not request_id_future.cancelled()
            and request_id_future.exception() is None
        ):
            request_id = request_id_future.result()

        if not cancellation_task.done():
            logging.debug(
                f"Cancelling cancellation monitor task for SGLang Request ID {request_id}, Context: {context.id()}"
            )
            cancellation_task.cancel()
            try:
                await cancellation_task
            except asyncio.CancelledError:
                pass
        else:
            logging.debug(
                f"Cancellation monitor task already completed for SGLang Request ID {request_id}, Context: {context.id()}"
            )
            # Retrieve the task's exception so a failed abort is visible in
            # the logs rather than surfacing later as asyncio's
            # "Task exception was never retrieved" warning.
            if not cancellation_task.cancelled():
                monitor_error = cancellation_task.exception()
                if monitor_error is not None:
                    logging.error(
                        f"Cancellation monitor task failed for SGLang Request ID {request_id}, Context: {context.id()}: {monitor_error!r}"
                    )
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio
import logging import logging
import time import time
from typing import Any, AsyncGenerator, Dict, Optional from typing import Any, AsyncGenerator, Dict, Optional
import sglang as sgl import sglang as sgl
from dynamo._core import Client, Component from dynamo._core import Client, Component, Context
from dynamo.sglang.args import Config, DisaggregationMode from dynamo.sglang.args import Config, DisaggregationMode
from dynamo.sglang.protocol import DisaggPreprocessedRequest from dynamo.sglang.protocol import DisaggPreprocessedRequest
from dynamo.sglang.publisher import DynamoSglangPublisher from dynamo.sglang.publisher import DynamoSglangPublisher
...@@ -96,12 +97,13 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -96,12 +97,13 @@ class DecodeWorkerHandler(BaseWorkerHandler):
return {k: v for k, v in param_mapping.items() if v is not None} return {k: v for k, v in param_mapping.items() if v is not None}
async def generate( async def generate(
self, request: Dict[str, Any] self, request: Dict[str, Any], context: Context
) -> AsyncGenerator[Dict[str, Any], None]: ) -> AsyncGenerator[Dict[str, Any], None]:
"""Generate response in aggregated or disaggregated mode. """Generate response in aggregated or disaggregated mode.
Args: Args:
request: Request dict with input and sampling parameters. request: Request dict with input and sampling parameters.
context: Context object for cancellation handling.
Yields: Yields:
Response dicts with token_ids or OpenAI-formatted chunks. Response dicts with token_ids or OpenAI-formatted chunks.
...@@ -109,6 +111,7 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -109,6 +111,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
Raises: Raises:
RuntimeError: If no bootstrap info received from prefill worker. RuntimeError: If no bootstrap info received from prefill worker.
""" """
logging.debug(f"New Request ID: {context.id()}")
sampling_params = self._build_sampling_params(request) sampling_params = self._build_sampling_params(request)
input_param = self._get_input_param(request) input_param = self._get_input_param(request)
...@@ -139,7 +142,8 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -139,7 +142,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
DisaggPreprocessedRequest( DisaggPreprocessedRequest(
request=request, request=request,
sampling_params=sampling_params, sampling_params=sampling_params,
).model_dump() ).model_dump(),
context=context,
) )
bootstrap_info = None bootstrap_info = None
...@@ -160,10 +164,10 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -160,10 +164,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
) )
if self.skip_tokenizer_init: if self.skip_tokenizer_init:
async for out in self._process_token_stream(decode): async for out in self._process_token_stream(decode, context):
yield out yield out
else: else:
async for out in self._process_text_stream(decode): async for out in self._process_text_stream(decode, context):
yield out yield out
else: else:
agg = await self.engine.async_generate( agg = await self.engine.async_generate(
...@@ -172,76 +176,116 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -172,76 +176,116 @@ class DecodeWorkerHandler(BaseWorkerHandler):
stream=True, stream=True,
) )
if self.skip_tokenizer_init: if self.skip_tokenizer_init:
async for out in self._process_token_stream(agg): async for out in self._process_token_stream(agg, context):
yield out yield out
else: else:
async for out in self._process_text_stream(agg): async for out in self._process_text_stream(agg, context):
yield out yield out
async def _process_token_stream( async def _process_token_stream(
self, stream_source: AsyncGenerator[Dict[str, Any], None] self,
stream_source: AsyncGenerator[Dict[str, Any], None],
context: Context,
) -> AsyncGenerator[Dict[str, Any], None]: ) -> AsyncGenerator[Dict[str, Any], None]:
"""Process token-based stream output. """Process token-based stream output.
Args: Args:
stream_source: Async generator from engine.async_generate. stream_source: Async generator from engine.async_generate.
context: Context object for cancellation handling.
Yields: Yields:
Dict with token_ids and optional finish_reason. Dict with token_ids and optional finish_reason.
""" """
num_output_tokens_so_far = 0 num_output_tokens_so_far = 0
async for res in stream_source: # Use Future pattern for request ID - will be set when first response arrives
out = {} request_id_future = asyncio.Future()
finish_reason = res["meta_info"]["finish_reason"] async with self._cancellation_monitor(request_id_future, context):
if finish_reason: async for res in stream_source:
out["finish_reason"] = finish_reason["type"] # Extract SGLang request ID from the first response and set the future
if not request_id_future.done():
output_ids = res.get("output_ids", []) meta_info = res.get("meta_info", {})
# If request is not finished yet, but there are no outputs, return an error. sglang_request_id = meta_info.get("id")
if not output_ids and not finish_reason: if sglang_request_id:
yield {"finish_reason": "error", "token_ids": []} request_id_future.set_result(sglang_request_id)
break logging.debug(f"New SGLang Request ID: {sglang_request_id}")
next_total_toks = len(output_ids) # Check cancellation before yielding to allow proper cleanup.
out["token_ids"] = output_ids[num_output_tokens_so_far:] # This lets SGLang proceed to the second token generation, which will
num_output_tokens_so_far = next_total_toks # async context switch and allow the abort monitor to signal cancellation.
yield out # The loop should exit by itself when context.is_stopped() returns True.
out = {}
finish_reason = res["meta_info"]["finish_reason"]
if finish_reason:
out["finish_reason"] = finish_reason["type"]
output_ids = res.get("output_ids", [])
# If request is not finished yet, but there are no outputs, return an error.
if not output_ids and not finish_reason:
if not context.is_stopped():
yield {"finish_reason": "error", "token_ids": []}
break
next_total_toks = len(output_ids)
out["token_ids"] = output_ids[num_output_tokens_so_far:]
num_output_tokens_so_far = next_total_toks
if not context.is_stopped():
yield out
async def _process_text_stream( async def _process_text_stream(
self, stream_source: AsyncGenerator[Dict[str, Any], None] self,
stream_source: AsyncGenerator[Dict[str, Any], None],
context: Context,
) -> AsyncGenerator[Dict[str, Any], None]: ) -> AsyncGenerator[Dict[str, Any], None]:
"""Process text-based stream output in OpenAI format. """Process text-based stream output in OpenAI format.
Args: Args:
stream_source: Async generator from engine.async_generate. stream_source: Async generator from engine.async_generate.
context: Context object for cancellation handling.
Yields: Yields:
OpenAI-formatted chat completion chunk dicts. OpenAI-formatted chat completion chunk dicts.
""" """
count = 0 count = 0
async for res in stream_source: # Use Future pattern for request ID - will be set when first response arrives
index = res.get("index", 0) request_id_future = asyncio.Future()
text = res.get("text", "") async with self._cancellation_monitor(request_id_future, context):
async for res in stream_source:
finish_reason = res["meta_info"]["finish_reason"] # Extract SGLang request ID from the first response and set the future
finish_reason_type = finish_reason["type"] if finish_reason else None if not request_id_future.done():
next_count = len(text) meta_info = res.get("meta_info", {})
delta = text[count:] sglang_request_id = meta_info.get("id")
if sglang_request_id:
choice_data = { request_id_future.set_result(sglang_request_id)
"index": index, logging.debug(f"New SGLang Request ID: {sglang_request_id}")
"delta": {"role": "assistant", "content": delta},
"finish_reason": finish_reason_type, # Check cancellation before yielding to allow proper cleanup.
} # This lets SGLang proceed to the second token generation, which will
# async context switch and allow the abort monitor to signal cancellation.
response = { # The loop should exit by itself when context.is_stopped() returns True.
"id": res["meta_info"]["id"],
"created": int(time.time()), index = res.get("index", 0)
"choices": [choice_data], text = res.get("text", "")
"model": self.config.server_args.served_model_name,
"object": "chat.completion.chunk", finish_reason = res["meta_info"]["finish_reason"]
} finish_reason_type = finish_reason["type"] if finish_reason else None
yield response next_count = len(text)
count = next_count delta = text[count:]
choice_data = {
"index": index,
"delta": {"role": "assistant", "content": delta},
"finish_reason": finish_reason_type,
}
response = {
"id": res["meta_info"]["id"],
"created": int(time.time()),
"choices": [choice_data],
"model": self.config.server_args.served_model_name,
"object": "chat.completion.chunk",
}
if not context.is_stopped():
yield response
count = next_count
...@@ -7,7 +7,7 @@ from typing import Any, AsyncGenerator, Dict ...@@ -7,7 +7,7 @@ from typing import Any, AsyncGenerator, Dict
import sglang as sgl import sglang as sgl
from dynamo._core import Component from dynamo._core import Component, Context
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.publisher import DynamoSglangPublisher from dynamo.sglang.publisher import DynamoSglangPublisher
from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler
...@@ -34,27 +34,36 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -34,27 +34,36 @@ class PrefillWorkerHandler(BaseWorkerHandler):
self.engine = engine self.engine = engine
self.bootstrap_host, self.bootstrap_port = self._get_bootstrap_info(self.engine) self.bootstrap_host, self.bootstrap_port = self._get_bootstrap_info(self.engine)
super().__init__(component, engine, config, publisher) super().__init__(component, engine, config, publisher)
self._consume_tasks = set()
logging.info( logging.info(
f"Prefill worker handler initialized - bootstrap host: {self.bootstrap_host}, bootstrap port: {self.bootstrap_port}" f"Prefill worker handler initialized - bootstrap host: {self.bootstrap_host}, bootstrap port: {self.bootstrap_port}"
) )
def cleanup(self) -> None: def cleanup(self) -> None:
"""Shutdown the prefill engine and cleanup resources.""" """Shutdown the prefill engine and cleanup resources."""
# Cancel all pending consume tasks
for task in self._consume_tasks:
if not task.done():
task.cancel()
self._consume_tasks.clear()
self.engine.shutdown() self.engine.shutdown()
logging.info("Prefill engine shutdown") logging.info("Prefill engine shutdown")
super().cleanup() super().cleanup()
async def generate( async def generate(
self, request: Dict[str, Any] self, request: Dict[str, Any], context: Context
) -> AsyncGenerator[Dict[str, Any], None]: ) -> AsyncGenerator[Dict[str, Any], None]:
"""Generate prefill output and provide bootstrap info for decode worker. """Generate prefill output and provide bootstrap info for decode worker.
Args: Args:
request: Request dict with 'request' and 'sampling_params' keys. request: Request dict with 'request' and 'sampling_params' keys.
context: Context object for cancellation handling.
Yields: Yields:
Bootstrap info dict with host, port, and room for decode worker connection. Bootstrap info dict with host, port, and room for decode worker connection.
""" """
logging.debug(f"New Request ID: {context.id()}")
bootstrap_room = self._generate_bootstrap_room() bootstrap_room = self._generate_bootstrap_room()
bootstrap_info = { bootstrap_info = {
...@@ -76,13 +85,31 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -76,13 +85,31 @@ class PrefillWorkerHandler(BaseWorkerHandler):
bootstrap_room=bootstrap_room, bootstrap_room=bootstrap_room,
) )
asyncio.create_task(self._consume_results(results)) task = asyncio.create_task(self._consume_results(results, context))
self._consume_tasks.add(task)
task.add_done_callback(self._consume_tasks.discard)
async def _consume_results(self, results: AsyncGenerator[Any, None]) -> None: async def _consume_results(
self, results: AsyncGenerator[Any, None], context: Context
) -> None:
"""Consume async generator results without processing. """Consume async generator results without processing.
Args: Args:
results: Async generator from engine.async_generate. results: Async generator from engine.async_generate.
context: Context object for cancellation handling.
""" """
async for _ in results: # Use Future pattern for request ID - will be set when first response arrives
pass request_id_future = asyncio.Future()
async with self._cancellation_monitor(request_id_future, context):
async for res in results:
# Extract SGLang request ID from the first response and set the future
if not request_id_future.done():
meta_info = res.get("meta_info", {})
sglang_request_id = meta_info.get("id")
if sglang_request_id:
request_id_future.set_result(sglang_request_id)
logging.debug(f"New Prefill Request ID: {sglang_request_id}")
# Note: No explicit cancellation checks needed here.
# When abort_request is called by the cancellation monitor,
# SGLang will terminate this async generator automatically.
...@@ -9,7 +9,7 @@ from sglang.srt.parser.conversation import chat_templates ...@@ -9,7 +9,7 @@ from sglang.srt.parser.conversation import chat_templates
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import dynamo.nixl_connect as connect import dynamo.nixl_connect as connect
from dynamo._core import Client, Component from dynamo._core import Client, Component, Context
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.multimodal_utils import ImageLoader, encode_image_embeddings from dynamo.sglang.multimodal_utils import ImageLoader, encode_image_embeddings
...@@ -90,7 +90,16 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler): ...@@ -90,7 +90,16 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
def cleanup(self): def cleanup(self):
pass pass
async def generate(self, request: SglangMultimodalRequest) -> AsyncIterator[str]: async def generate(
self, request: SglangMultimodalRequest, context: Context
) -> AsyncIterator[str]:
"""
Generate precomputed embeddings for multimodal input.
Args:
request: Multimodal request with image/video data.
context: Context object for cancellation handling.
"""
if not isinstance(request, SglangMultimodalRequest): if not isinstance(request, SglangMultimodalRequest):
if isinstance(request, str): if isinstance(request, str):
request = SglangMultimodalRequest.model_validate_json(request) request = SglangMultimodalRequest.model_validate_json(request)
......
...@@ -9,7 +9,7 @@ from typing import Any, Dict ...@@ -9,7 +9,7 @@ from typing import Any, Dict
from transformers import AutoTokenizer from transformers import AutoTokenizer
from dynamo._core import Client, Component from dynamo._core import Client, Component, Context
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.multimodal_utils import ( from dynamo.sglang.multimodal_utils import (
multimodal_request_to_sglang, multimodal_request_to_sglang,
...@@ -54,7 +54,14 @@ class MultimodalProcessorHandler(BaseWorkerHandler): ...@@ -54,7 +54,14 @@ class MultimodalProcessorHandler(BaseWorkerHandler):
def cleanup(self): def cleanup(self):
pass pass
async def generate(self, raw_request: MultiModalRequest): async def generate(self, raw_request: MultiModalRequest, context: Context):
"""
Process multimodal request and forward to encode worker.
Args:
raw_request: Raw multimodal request to process.
context: Context object for cancellation handling.
"""
if not isinstance(raw_request, MultiModalRequest): if not isinstance(raw_request, MultiModalRequest):
# If the request is not MultiModalRequest, convert it to MultiModalRequest # If the request is not MultiModalRequest, convert it to MultiModalRequest
raw_request = MultiModalRequest.model_validate(raw_request) raw_request = MultiModalRequest.model_validate(raw_request)
......
...@@ -10,7 +10,7 @@ import sglang as sgl ...@@ -10,7 +10,7 @@ import sglang as sgl
import torch import torch
import dynamo.nixl_connect as connect import dynamo.nixl_connect as connect
from dynamo._core import Client, Component from dynamo._core import Client, Component, Context
from dynamo.sglang.args import Config, DisaggregationMode from dynamo.sglang.args import Config, DisaggregationMode
from dynamo.sglang.protocol import ( from dynamo.sglang.protocol import (
DisaggSglangMultimodalRequest, DisaggSglangMultimodalRequest,
...@@ -275,10 +275,16 @@ class MultimodalWorkerHandler(BaseWorkerHandler): ...@@ -275,10 +275,16 @@ class MultimodalWorkerHandler(BaseWorkerHandler):
request = SglangMultimodalRequest.model_validate(request) request = SglangMultimodalRequest.model_validate(request)
return request return request
async def generate(self, request: SglangMultimodalRequest) -> AsyncIterator[str]: async def generate(
self, request: SglangMultimodalRequest, context: Context
) -> AsyncIterator[str]:
""" """
Generate response using SGLang with multimodal data Generate response using SGLang with multimodal data
Handles both aggregated and disaggregated modes (following regular SGLang DecodeWorkerHandler pattern) Handles both aggregated and disaggregated modes (following regular SGLang DecodeWorkerHandler pattern)
Args:
request: Multimodal request with input and parameters.
context: Context object for cancellation handling.
""" """
try: try:
request = self._validate_and_parse_request(request) request = self._validate_and_parse_request(request)
...@@ -429,10 +435,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler): ...@@ -429,10 +435,14 @@ class MultimodalPrefillWorkerHandler(BaseWorkerHandler):
await self.embeddings_processor.initialize() await self.embeddings_processor.initialize()
async def generate( async def generate(
self, disagg_request: DisaggSglangMultimodalRequest self, disagg_request: DisaggSglangMultimodalRequest, context: Context
) -> AsyncIterator[str]: ) -> AsyncIterator[str]:
""" """
Handle prefill phase: process multimodal input and provide bootstrap info Handle prefill phase: process multimodal input and provide bootstrap info
Args:
disagg_request: Disaggregated multimodal request.
context: Context object for cancellation handling.
""" """
bootstrap_room = None bootstrap_room = None
try: try:
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import shutil
import time
import pytest
from tests.fault_tolerance.cancellation.utils import (
DynamoFrontendProcess,
poll_for_pattern,
read_streaming_responses,
send_cancellable_request,
)
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
from tests.utils.engine_process import FRONTEND_PORT
from tests.utils.managed_process import ManagedProcess
from tests.utils.payloads import check_health_generate, check_models_api
logger = logging.getLogger(__name__)
class DynamoWorkerProcess(ManagedProcess):
    """Launches and supervises a Dynamo worker that uses the SGLang backend."""

    def __init__(self, request, mode: str = "agg"):
        """Assemble the command line, environment and health checks, then
        hand them to ``ManagedProcess``.

        Args:
            request: pytest request object; its node name is used to build
                the log directory name.
            mode: One of "agg", "prefill", "decode".
        """
        launch_cmd = [
            "python3",
            "-m",
            "dynamo.sglang",
            "--model-path",
            FAULT_TOLERANCE_MODEL_NAME,
            "--served-model-name",
            FAULT_TOLERANCE_MODEL_NAME,
            "--page-size",
            "16",
            "--tp",
            "1",
            "--trust-remote-code",
        ]

        if mode != "agg":
            # Disaggregated mode - same disaggregation arguments as disagg.sh
            launch_cmd += [
                "--disaggregation-mode",
                mode,
                "--disaggregation-bootstrap-port",
                "12345",
                "--host",
                "0.0.0.0",
                "--disaggregation-transfer-backend",
                "nixl",
            ]
        else:
            # Aggregated mode - skip tokenizer init like the serve test
            launch_cmd.append("--skip-tokenizer-init")

        # Only the prefill worker uses 8082; every other mode uses 8081
        system_port = "8082" if mode == "prefill" else "8081"

        if mode in ("prefill", "decode"):
            # Disaggregated workers are probed on their own system port
            probes = [(f"http://localhost:{system_port}/health", self.is_ready)]
        else:
            # Aggregated workers are probed through the frontend
            probes = [
                (f"http://localhost:{FRONTEND_PORT}/v1/models", check_models_api),
                (f"http://localhost:{FRONTEND_PORT}/health", check_health_generate),
            ]

        # Debug logging plus system endpoint settings for the worker
        worker_env = os.environ.copy()
        worker_env.update(
            {
                "DYN_LOG": "debug",
                "DYN_SYSTEM_ENABLED": "true",
                "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS": '["generate"]',
                "DYN_SYSTEM_PORT": system_port,
            }
        )

        # GPU pinning for disaggregated mode (like disagg.sh); aggregated
        # mode keeps the default GPU assignment
        if mode == "prefill":
            worker_env["CUDA_VISIBLE_DEVICES"] = "0"
        elif mode == "decode":
            worker_env["CUDA_VISIBLE_DEVICES"] = "1"

        log_dir = f"{request.node.name}_{mode}_worker"

        # Remove a log directory left over from a previous run, if any
        try:
            shutil.rmtree(log_dir)
        except FileNotFoundError:
            pass
        else:
            logger.info(f"Cleaned up existing log directory: {log_dir}")

        super().__init__(
            command=launch_cmd,
            env=worker_env,
            health_check_urls=probes,
            timeout=300,
            display_output=True,
            terminate_existing=False,
            # Ensure any orphaned SGLang engine cores or child helpers are cleaned up
            stragglers=["SGLANG:EngineCore"],
            straggler_commands=["-m dynamo.sglang"],
            log_dir=log_dir,
        )

        self.mode = mode

    def get_pid(self):
        """Return the worker's PID, or None when no process is attached."""
        if self.proc:
            return self.proc.pid
        return None

    def is_ready(self, response) -> bool:
        """Return True when the health response body reports status "ready".

        Args:
            response: Response object exposing ``json()``.
        """
        try:
            status = response.json().get("status")
        except ValueError:
            logger.warning(
                f"{self.mode.capitalize()} worker health response is not valid JSON"
            )
            return False

        if status == "ready":
            logger.info(f"{self.mode.capitalize()} worker status is ready")
            return True

        logger.warning(
            f"{self.mode.capitalize()} worker status is not ready: {status}"
        )
        return False
@pytest.mark.e2e
@pytest.mark.sglang
@pytest.mark.gpu_1
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.xfail(strict=False)
def test_request_cancellation_sglang_aggregated(
    request, runtime_services, predownload_models
):
    """
    End-to-end check of client-side request cancellation in aggregated mode.

    For each request type the test sends a request, waits until the worker
    log shows both the Dynamo context ID and the SGLang request ID, cancels
    the request, and then expects the worker to log the abort and the
    frontend to log the kill control message.

    TODO: Test is currently flaky/failing due to SGLang limitations with prefill cancellation.
    See: https://github.com/sgl-project/sglang/issues/11139
    """
    logger.info("Sanity check if latest test is getting executed")

    # Step 1: bring up the frontend
    with DynamoFrontendProcess(request) as frontend:
        logger.info("Frontend started successfully")

        # Step 2: bring up a single aggregated worker
        with DynamoWorkerProcess(request, mode="agg") as worker:
            logger.info(f"Aggregated Worker PID: {worker.get_pid()}")

            # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
            time.sleep(2)

            # Step 3: cancel each request type, tracking how far into each
            # log we have already read
            frontend_pos = 0
            worker_pos = 0

            scenarios = (
                ("completion", "Completion request cancellation"),
                ("chat_completion", "Chat completion request cancellation"),
                (
                    "chat_completion_stream",
                    "Chat completion stream request cancellation",
                ),
            )

            for kind, label in scenarios:
                logger.info(f"Testing {label.lower()}...")

                # Fire the request without blocking on its completion
                in_flight = send_cancellable_request(kind)

                # The worker logs the Dynamo context ID when the request lands
                request_id, worker_pos = poll_for_pattern(
                    process=worker,
                    pattern="New Request ID: ",
                    log_offset=worker_pos,
                    match_type="contains",
                )

                # Streaming needs one chunk consumed before the SGLang ID is logged
                if kind == "chat_completion_stream":
                    read_streaming_responses(in_flight, expected_count=1)

                # Wait until SGLang has actually picked the request up
                _, worker_pos = poll_for_pattern(
                    process=worker,
                    pattern="New SGLang Request ID: ",
                    log_offset=worker_pos,
                    match_type="contains",
                )

                # SGLang owns the request now, so cancelling exercises the abort path
                in_flight.cancel()
                logger.info(f"Cancelled request ID: {request_id}")

                # The worker must report the abort for this exact context ID
                _, worker_pos = poll_for_pattern(
                    process=worker,
                    pattern=f"Aborted Request ID: {request_id}",
                    log_offset=worker_pos,
                    max_wait_ms=2000,
                )

                # The frontend must have issued the kill control message
                _, frontend_pos = poll_for_pattern(
                    process=frontend,
                    pattern="issued control message Kill to sender",
                    log_offset=frontend_pos,
                )

                logger.info(f"{label} detected successfully")
@pytest.mark.e2e
@pytest.mark.sglang
@pytest.mark.gpu_2
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_cancellation_sglang_decode_cancel(
    request, runtime_services, predownload_models
):
    """
    End-to-end test for request cancellation during remote decode phase.

    This test verifies that when a request is cancelled by the client during
    the remote decode phase, the decode worker logs the abort for the request
    and the frontend logs the kill control message, in a disaggregated setup
    with separate prefill and decode workers.

    Note: This test requires 2 GPUs to run decode and prefill workers on separate GPUs.
    """
    # Step 1: Start the frontend
    with DynamoFrontendProcess(request) as frontend:
        logger.info("Frontend started successfully")

        # Step 2: Start the decode worker
        with DynamoWorkerProcess(request, mode="decode") as decode_worker:
            logger.info(f"Decode Worker PID: {decode_worker.get_pid()}")

            # Step 3: Start the prefill worker
            with DynamoWorkerProcess(request, mode="prefill") as prefill_worker:
                logger.info(f"Prefill Worker PID: {prefill_worker.get_pid()}")

                # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
                time.sleep(2)

                # Step 4: Test request cancellation during remote decode phase
                logger.info(
                    "Testing chat completion stream request cancellation during remote decode phase..."
                )

                # Send streaming request (non-blocking)
                cancellable_req = send_cancellable_request("chat_completion_stream")

                # Poll for "New Request ID" pattern in decode worker (Dynamo context ID)
                request_id, decode_log_offset = poll_for_pattern(
                    process=decode_worker,
                    pattern="New Request ID: ",
                    match_type="contains",
                )

                # Verify same request ID reached prefill worker; the returned
                # offset is not needed because the prefill log is not read again
                poll_for_pattern(
                    process=prefill_worker,
                    pattern=f"New Request ID: {request_id}",
                )

                # Read one response first to trigger SGLang ID logging in decode worker
                read_streaming_responses(cancellable_req, expected_count=1)

                # Wait for SGLang to start processing in decode worker
                _, decode_log_offset = poll_for_pattern(
                    process=decode_worker,
                    pattern="New SGLang Request ID: ",
                    log_offset=decode_log_offset,
                    match_type="contains",
                )

                # Now we know SGLang has the request in decode worker, cancel it
                cancellable_req.cancel()
                logger.info(f"Cancelled request ID: {request_id}")

                # Poll for "Aborted Request ID" in decode worker; this is the
                # last read of the decode log, so the offset is discarded
                poll_for_pattern(
                    process=decode_worker,
                    pattern=f"Aborted Request ID: {request_id}",
                    log_offset=decode_log_offset,
                )

                # Verify frontend log has kill message
                poll_for_pattern(
                    process=frontend,
                    pattern="issued control message Kill to sender",
                )

                logger.info(
                    "Chat completion stream cancellation in decode phase detected successfully"
                )
...@@ -116,7 +116,7 @@ sglang_configs = { ...@@ -116,7 +116,7 @@ sglang_configs = {
# NOTE: The response text may mention 'bus', 'train', 'streetcar', etc. # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
# so we need something consistently found in the response, or a different # so we need something consistently found in the response, or a different
# approach to validation for this test to be stable. # approach to validation for this test to be stable.
expected_response=["OUT OF SERVICE"], expected_response=["image"],
temperature=0.0, temperature=0.0,
) )
], ],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment