feat: FT Request Cancellation feature and test for 0.5.0 (#2500)

6c539fbd · Jacky · GitHub · 7fabe7bf · 6c539fbd · 6c539fbd
Unverified Commit 6c539fbd authored Sep 02, 2025 by Jacky Committed by GitHub Sep 02, 2025
4 changed files
--- a/lib/runtime/src/engine.rs
+++ b/lib/runtime/src/engine.rs
@@ -159,6 +159,12 @@ pub trait AsyncEngineContext: Send + Sync + Debug {
    /// terminate without draining the remaining items in the stream. This is implementation
    /// specific and may not be supported by all engines.
    fn kill(&self);
+
+    /// Links child AsyncEngineContext to this AsyncEngineContext. If the `stop_generating`, `stop`
+    /// or `kill` on this AsyncEngineContext is called, the same method is called on all linked
+    /// child AsyncEngineContext, in the order they are linked, and then the method on this
+    /// AsyncEngineContext continues.
+    fn link_child(&self, child: Arc<dyn AsyncEngineContext>);
 }

 /// Provides access to the [`AsyncEngineContext`] associated with an engine operation.

--- a/lib/runtime/src/pipeline/context.rs
+++ b/lib/runtime/src/pipeline/context.rs
@@ -22,7 +22,7 @@
 //!   registry and visitors. `StreamAdaptors` will amend themselves to the [`StreamContext`] to allow for the

 use std::ops::{Deref, DerefMut};
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};

 use super::{AsyncEngineContext, AsyncEngineContextProvider, Data};
 use crate::engine::AsyncEngineController;
@@ -300,6 +300,10 @@ impl AsyncEngineContext for StreamContext {
    async fn killed(&self) {
        self.controller.killed().await
    }
+
+    fn link_child(&self, child: Arc<dyn AsyncEngineContext>) {
+        self.controller.link_child(child);
+    }
 }

 impl AsyncEngineContextProvider for StreamContext {
@@ -331,12 +335,18 @@ pub struct Controller {
    id: String,
    tx: Sender<State>,
    rx: Receiver<State>,
+    child_context: Mutex<Vec<Arc<dyn AsyncEngineContext>>>,
 }

 impl Controller {
    pub fn new(id: String) -> Self {
        let (tx, rx) = channel(State::Live);
-        Self { id, tx, rx }
+        Self {
+            id,
+            tx,
+            rx,
+            child_context: Mutex::new(Vec::new()),
+        }
    }

    pub fn id(&self) -> &str {
@@ -383,16 +393,59 @@ impl AsyncEngineContext for Controller {
    }

    fn stop_generating(&self) {
-        self.stop();
+        // Clone child Arcs to avoid deadlock if parent is accidentally linked under child
+        let children = self
+            .child_context
+            .lock()
+            .expect("Failed to lock child context")
+            .iter()
+            .cloned()
+            .collect::<Vec<_>>();
+        for child in children {
+            child.stop_generating();
+        }
+
+        let _ = self.tx.send(State::Stopped);
    }

    fn stop(&self) {
+        // Clone child Arcs to avoid deadlock if parent is accidentally linked under child
+        let children = self
+            .child_context
+            .lock()
+            .expect("Failed to lock child context")
+            .iter()
+            .cloned()
+            .collect::<Vec<_>>();
+        for child in children {
+            child.stop();
+        }
+
        let _ = self.tx.send(State::Stopped);
    }

    fn kill(&self) {
+        // Clone child Arcs to avoid deadlock if parent is accidentally linked under child
+        let children = self
+            .child_context
+            .lock()
+            .expect("Failed to lock child context")
+            .iter()
+            .cloned()
+            .collect::<Vec<_>>();
+        for child in children {
+            child.kill();
+        }
+
        let _ = self.tx.send(State::Killed);
    }
+
+    fn link_child(&self, child: Arc<dyn AsyncEngineContext>) {
+        self.child_context
+            .lock()
+            .expect("Failed to lock child context")
+            .push(child);
+    }
 }

 #[cfg(test)]

--- a/tests/fault_tolerance/README.md
+++ b/tests/fault_tolerance/README.md
@@ -14,7 +14,7 @@ Tests worker fault tolerance with migration support using the `test_request_migr
   - Model: `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`
   - `--enforce-eager`, `--gpu-memory-utilization 0.45`
   - `--max-model-len 8192`, `--migration-limit 3`
-3. Waits for both workers to be fully ready (looking for "Reading Events from" messages)
+3. Waits for both workers to be fully ready (health check returns "ready" status)
 4. Sends a test request ("Who are you?", 100 tokens) to determine which worker handles requests
 5. Determines primary/backup worker roles based on round-robin routing and log analysis
 6. Sends a long completion request ("Tell me a long long long story about yourself?", 8000 tokens) in a separate thread
@@ -22,12 +22,56 @@ Tests worker fault tolerance with migration support using the `test_request_migr
 8. Verifies the request completes successfully despite the worker failure (with 240s timeout)
 9. Checks that the frontend logs contain "Stream disconnected... recreating stream..." indicating migration occurred

+### `test_request_cancellation.py`
+
+Tests request cancellation functionality across multiple API endpoints and deployment configurations. Contains three test functions:
+
+#### `test_request_cancellation_vllm`
+Tests basic request cancellation with a single worker:
+
+0. Downloads the DeepSeek-R1-Distill-Llama-8B model from HuggingFace if not already cached
+1. Starts a Dynamo frontend using `python -m dynamo.frontend` with debug logging enabled
+2. Starts a single worker using `python3 -m dynamo.vllm` with specific configuration:
+   - Model: `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`
+   - `--enforce-eager`, `--gpu-memory-utilization 0.45`, `--max-model-len 8192`, `--migration-limit 3`
+   - Debug logging enabled on port 8081
+3. Tests request cancellation across three scenarios:
+   - **Completion API**: `/v1/completions` endpoint cancellation
+   - **Chat Completion API (non-streaming)**: `/v1/chat/completions` endpoint cancellation
+   - **Chat Completion API (streaming)**: `/v1/chat/completions` with streaming cancellation
+4. For each scenario:
+   - Sends a long request with 1-second timeout to trigger cancellation
+   - Validates that cancellation messages appear in both frontend and worker logs
+   - Uses incremental log offset tracking to avoid false positives from previous tests
+5. Checks for specific cancellation patterns:
+   - Frontend log: "issued control message Kill to sender"
+   - Worker log: "Aborted Request ID: <request_id>" matching the "New Request ID: <request_id>"
+
+#### `test_request_cancellation_vllm_decode`
+Tests request cancellation during disaggregated decode phase:
+
+0. Downloads the DeepSeek-R1-Distill-Llama-8B model from HuggingFace if not already cached
+1. Starts a Dynamo frontend using `python -m dynamo.frontend` with debug logging enabled
+2. Starts a prefill worker using `python3 -m dynamo.vllm --is-prefill-worker` on port 8082
+3. Starts a decode worker using `python3 -m dynamo.vllm` on port 8081
+4. Tests completion request cancellation in the disaggregated setup
+5. Validates cancellation messages appear in prefill worker, decode worker, and frontend logs
+6. Checks for specific patterns:
+   - Frontend log: "issued control message Kill to sender"
+   - Decode worker log: "Aborted Request ID: <request_id>"
+   - Prefill worker log: "New Prefill Request ID: <request_id>"
+
+#### `test_request_cancellation_vllm_prefill`
+Tests request cancellation during disaggregated prefill phase:
+
+- (Skipped until request cancellation can cancel before receiving the first response)
+
 ## Prerequisites

- vLLM backend installed (`pip install ai-dynamo-vllm`)
+- vLLM backend installed
 - NATS and etcd services running (provided by `runtime_services` fixture)
 - Access to DeepSeek-R1-Distill-Llama-8B model (automatically downloaded from HuggingFace)
- Sufficient GPU memory (test uses 0.45 GPU memory utilization)
+- Sufficient GPU memory

 ## Running the Tests

@@ -35,16 +79,12 @@ To run the fault tolerance tests:

 ```bash
 # Run all fault tolerance tests
-pytest /workspace/tests/fault_tolerance
-
-# Run specific test with verbose output
-pytest /workspace/tests/fault_tolerance/test_request_migration.py::test_request_migration_vllm -v
-
-# Run with specific markers
 pytest -m "e2e and vllm" /workspace/tests/fault_tolerance

-# Run with debug logging
+# Run specific test functions with debug logging
 pytest /workspace/tests/fault_tolerance/test_request_migration.py::test_request_migration_vllm -v -s
+pytest /workspace/tests/fault_tolerance/test_request_cancellation.py::test_request_cancellation_vllm -v -s
+pytest /workspace/tests/fault_tolerance/test_request_cancellation.py::test_request_cancellation_vllm_decode -v -s
 ```

 ## Test Markers
@@ -61,11 +101,11 @@ pytest /workspace/tests/fault_tolerance/test_request_migration.py::test_request_

 ## Expected Test Duration

-The test typically takes 2-3 minutes to complete, including:
+The tests typically take 2-3 minutes to complete each, including:
 - Model download/loading time (if not cached) - can take 1-2 minutes for first run
 - Worker startup and registration
 - Request processing and response validation
- Worker failure simulation and migration
+- Worker failure simulation and migration (for migration test) / Request cancellation validation (for cancellation tests)
 - Cleanup

 ## Troubleshooting
@@ -74,7 +114,10 @@ If tests fail:

 1. Check that NATS and etcd services are running
 2. Verify vLLM backend is properly installed
-3. Ensure sufficient GPU memory is available (test requires ~45% GPU memory)
+3. Ensure sufficient GPU memory is available
 4. Check internet connectivity for model download from HuggingFace
 5. Review test logs for specific error messages
 6. Verify that the DeepSeek-R1-Distill-Llama-8B model can be accessed
+7. For cancellation tests: Check that timeout-based cancellation is working properly and cancellation patterns appear in logs
+8. For migration tests: Verify worker process termination and stream recreation behavior
+9. For disaggregated cancellation tests: Ensure both prefill and decode workers are properly started and cancellation works across the disaggregated setup
--- a/tests/fault_tolerance/test_request_cancellation.py
+++ b/tests/fault_tolerance/test_request_cancellation.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+import re
+import shutil
+import time
+
+import pytest
+import requests
+from huggingface_hub import snapshot_download
+
+from tests.utils.managed_process import ManagedProcess
+
+logger = logging.getLogger(__name__)
+
+
+class DynamoFrontendProcess(ManagedProcess):
+    """Process manager for Dynamo frontend"""
+
+    def __init__(self, request):
+        command = ["python", "-m", "dynamo.frontend"]
+
+        # Set debug logging environment
+        env = os.environ.copy()
+        env["DYN_LOG"] = "debug"
+
+        log_dir = f"{request.node.name}_frontend"
+
+        # Clean up any existing log directory from previous runs
+        try:
+            shutil.rmtree(log_dir)
+            logger.info(f"Cleaned up existing log directory: {log_dir}")
+        except FileNotFoundError:
+            # Directory doesn't exist, which is fine
+            pass
+
+        super().__init__(
+            command=command,
+            env=env,
+            display_output=True,
+            terminate_existing=True,
+            log_dir=log_dir,
+        )
+
+
+class DynamoWorkerProcess(ManagedProcess):
+    """Process manager for Dynamo worker with vLLM backend"""
+
+    def __init__(self, request, is_prefill: bool = False):
+        command = [
+            "python3",
+            "-m",
+            "dynamo.vllm",
+            "--model",
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            "--enforce-eager",
+            "--gpu-memory-utilization",
+            "0.45",
+            "--max-model-len",
+            "8192",
+            "--migration-limit",
+            "3",
+        ]
+
+        # Add prefill worker flag if needed
+        if is_prefill:
+            command.append("--is-prefill-worker")
+
+        # Set port based on worker type
+        port = "8082" if is_prefill else "8081"
+
+        # Set debug logging environment
+        env = os.environ.copy()
+        env["DYN_LOG"] = "debug"
+        env["DYN_SYSTEM_ENABLED"] = "true"
+        env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
+        env["DYN_SYSTEM_PORT"] = port
+
+        # Set log directory based on worker type
+        worker_type = "prefill_worker" if is_prefill else "worker"
+        log_dir = f"{request.node.name}_{worker_type}"
+
+        # Clean up any existing log directory from previous runs
+        try:
+            shutil.rmtree(log_dir)
+            logger.info(f"Cleaned up existing log directory: {log_dir}")
+        except FileNotFoundError:
+            # Directory doesn't exist, which is fine
+            pass
+
+        super().__init__(
+            command=command,
+            env=env,
+            health_check_urls=[(f"http://localhost:{port}/health", self.is_ready)],
+            timeout=300,
+            display_output=True,
+            terminate_existing=False,
+            log_dir=log_dir,
+        )
+
+        self.is_prefill = is_prefill
+
+    def get_pid(self):
+        """Get the PID of the worker process"""
+        return self.proc.pid if self.proc else None
+
+    def is_ready(self, response) -> bool:
+        """Check the health of the worker process"""
+        try:
+            data = response.json()
+            if data.get("status") == "ready":
+                worker_type = "Prefill worker" if self.is_prefill else "Worker"
+                logger.info(f"{worker_type} status is ready")
+                return True
+            worker_type = "Prefill worker" if self.is_prefill else "Worker"
+            logger.warning(f"{worker_type} status is not ready: {data.get('status')}")
+        except ValueError:
+            worker_type = "Prefill worker" if self.is_prefill else "Worker"
+            logger.warning(f"{worker_type} health response is not valid JSON")
+        return False
+
+
+def download_model() -> None:
+    """
+    Download the DeepSeek-R1-Distill-Llama-8B model from HuggingFace Hub if not already cached.
+    """
+    model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+    logger.info(f"Caching model {model_id}...")
+
+    max_retries = 5
+    retry_delay = 30  # seconds
+
+    for attempt in range(max_retries):
+        try:
+            # Download the model to the default cache directory
+            # This will skip download if the model is already cached
+            snapshot_download(
+                repo_id="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+                repo_type="model",
+                local_files_only=False,
+            )
+            logger.info(f"Model {model_id} is ready for use")
+            return  # Success, exit the function
+        except Exception as e:
+            if attempt < max_retries - 1:  # Not the last attempt
+                logger.warning(
+                    f"Failed to download model {model_id} (attempt {attempt + 1}/{max_retries}): {e}"
+                )
+                logger.info(f"Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+            else:  # Last attempt failed
+                logger.error(
+                    f"Failed to download model {model_id} after {max_retries} attempts: {e}"
+                )
+                raise
+
+
+def send_completion_request(
+    prompt: str, max_tokens: int, timeout: int = 120
+) -> requests.Response:
+    """Send a completion request to the frontend"""
+    payload = {
+        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "prompt": prompt,
+        "max_tokens": max_tokens,
+    }
+
+    headers = {"Content-Type": "application/json"}
+
+    logger.info(
+        f"Sending completion request with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
+    )
+
+    session = requests.Session()
+    try:
+        response = session.post(
+            "http://localhost:8080/v1/completions",
+            headers=headers,
+            json=payload,
+            timeout=timeout,
+        )
+        logger.info(f"Received response with status code: {response.status_code}")
+        return response
+    except requests.exceptions.Timeout:
+        logger.error(f"Request timed out after {timeout} seconds")
+        raise
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Request failed with error: {e}")
+        raise
+
+
+def send_chat_completion_request(
+    prompt: str, max_tokens: int, timeout: int = 120, stream: bool = False
+) -> requests.Response:
+    """Send a chat completion request to the frontend"""
+    payload = {
+        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "stream": stream,
+    }
+
+    headers = {"Content-Type": "application/json"}
+
+    logger.info(
+        f"Sending chat completion request (stream={stream}) with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
+    )
+
+    session = requests.Session()
+    try:
+        response = session.post(
+            "http://localhost:8080/v1/chat/completions",
+            headers=headers,
+            json=payload,
+            timeout=timeout,
+            stream=stream,
+        )
+        logger.info(f"Received response with status code: {response.status_code}")
+        return response
+    except requests.exceptions.Timeout:
+        logger.error(f"Request timed out after {timeout} seconds")
+        raise
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Request failed with error: {e}")
+        raise
+
+
+def send_request_and_cancel(request_type: str = "completion", timeout: int = 1):
+    """Send a request with short timeout to trigger cancellation"""
+    logger.info(f"Sending {request_type} request to be cancelled...")
+
+    prompt = "Tell me a very long and detailed story about the history of artificial intelligence, including all major milestones, researchers, and breakthroughs?"
+    try:
+        if request_type == "completion":
+            response = send_completion_request(prompt, 8000, timeout)
+        elif request_type == "chat_completion":
+            response = send_chat_completion_request(prompt, 8000, timeout, False)
+        elif request_type == "chat_completion_stream":
+            response = send_chat_completion_request(prompt, 8000, timeout, True)
+            # Read a few responses and then disconnect
+            if response.status_code == 200:
+                itr_count, max_itr = 0, 5
+                try:
+                    for res in response.iter_lines():
+                        logger.info(f"Received response {itr_count + 1}: {res[:50]}...")
+                        itr_count += 1
+                        if itr_count >= max_itr:
+                            break
+                        time.sleep(0.1)
+                except Exception as e:
+                    pytest.fail(f"Stream reading failed: {e}")
+
+            response.close()
+            raise Exception("Closed response")
+        else:
+            pytest.fail(f"Unknown request type: {request_type}")
+
+        pytest.fail(
+            f"{request_type} request completed unexpectedly - should have been cancelled"
+        )
+    except Exception as e:
+        logger.info(f"{request_type} request was cancelled: {e}")
+
+
+def read_log_content(log_path: str | None) -> str:
+    """Read log content from a file"""
+    if log_path is None:
+        pytest.fail("Log path is None - cannot read log content")
+
+    try:
+        with open(log_path, "r") as f:
+            return f.read()
+    except Exception as e:
+        pytest.fail(f"Could not read log file {log_path}: {e}")
+
+
+def strip_ansi_codes(text: str) -> str:
+    """Remove ANSI color codes from text"""
+    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
+    return ansi_escape.sub("", text)
+
+
+def verify_request_cancelled(
+    frontend_process: DynamoFrontendProcess,
+    worker_process: DynamoWorkerProcess,
+    prefill_worker_process: DynamoWorkerProcess | None = None,
+    frontend_log_offset: int = 0,
+    worker_log_offset: int = 0,
+    prefill_worker_log_offset: int = 0,
+) -> tuple[int, int]:
+    """Verify that the worker and frontend logs contain cancellation messages
+
+    Returns:
+        tuple: (new_worker_log_length, new_frontend_log_length)
+    """
+
+    # Check worker log for cancellation pattern
+    worker_log_content = read_log_content(worker_process._log_path)
+    new_worker_content = worker_log_content[worker_log_offset:]
+
+    # Find request ID from "New Request ID: <id>" line
+    request_id = None
+    for line in new_worker_content.split("\n"):
+        # Strip ANSI codes and whitespace for pattern matching
+        clean_line = strip_ansi_codes(line).strip()
+        if "New Request ID: " in clean_line:
+            # Extract ID from the end of the line
+            parts = clean_line.split("New Request ID: ")
+            if len(parts) > 1:
+                request_id = parts[-1].strip()
+                break
+    if request_id is None:
+        pytest.fail("Could not find 'New Request ID: <id>' pattern in worker log")
+
+    # Check if the same request ID was cancelled
+    has_worker_cancellation = False
+    cancellation_pattern = f"Aborted Request ID: {request_id}"
+    for line in new_worker_content.split("\n"):
+        # Strip ANSI codes and whitespace for pattern matching
+        clean_line = strip_ansi_codes(line).strip()
+        if clean_line.endswith(cancellation_pattern):
+            has_worker_cancellation = True
+            break
+    if not has_worker_cancellation:
+        pytest.fail(
+            f"Could not find 'Aborted Request ID: {request_id}' pattern in worker log"
+        )
+
+    # Check if the same request ID was remote prefilled
+    if prefill_worker_process is not None:
+        prefill_worker_log_content = read_log_content(prefill_worker_process._log_path)
+        new_prefill_worker_content = prefill_worker_log_content[
+            prefill_worker_log_offset:
+        ]
+
+        has_remote_prefill = False
+        remote_prefill_pattern = f"New Prefill Request ID: {request_id}"
+        for line in new_prefill_worker_content.split("\n"):
+            clean_line = strip_ansi_codes(line).strip()
+            if clean_line.endswith(remote_prefill_pattern):
+                has_remote_prefill = True
+                break
+        if not has_remote_prefill:
+            pytest.fail(
+                f"Could not find 'New Prefill Request ID: {request_id}' pattern in prefill worker log"
+            )
+
+    # Check frontend log for cancellation issued pattern
+    frontend_log_content = read_log_content(frontend_process._log_path)
+    new_frontend_content = frontend_log_content[frontend_log_offset:]
+
+    has_kill_message = False
+    kill_message = "issued control message Kill to sender"
+    for line in new_frontend_content.split("\n"):
+        # Strip ANSI codes and whitespace for pattern matching
+        clean_line = strip_ansi_codes(line).strip()
+        if clean_line.endswith(kill_message):
+            has_kill_message = True
+            break
+    if not has_kill_message:
+        pytest.fail("Could not find cancellation issued in frontend log")
+
+    return len(frontend_log_content), len(worker_log_content)
+
+
+@pytest.mark.vllm
+@pytest.mark.gpu_1
+@pytest.mark.e2e
+@pytest.mark.slow
+def test_request_cancellation_vllm(request, runtime_services):
+    """
+    End-to-end test for request cancellation functionality.
+
+    This test verifies that when a request is cancelled by the client,
+    the system properly handles the cancellation and cleans up resources
+    on the worker side. Tests three scenarios:
+    1. Completion request
+    2. Chat completion request (non-streaming)
+    3. Chat completion request (streaming)
+    """
+    # Step 0: Download the model from HuggingFace if not already cached
+    download_model()
+
+    # Step 1: Start the frontend
+    with DynamoFrontendProcess(request) as frontend:
+        logger.info("Frontend started successfully")
+
+        # Step 2: Start a single worker
+        logger.info("Starting worker...")
+        worker = DynamoWorkerProcess(request)
+
+        with worker:
+            logger.info(f"Worker PID: {worker.get_pid()}")
+
+            # TODO: Why the model is not immediately available at the frontend after health check
+            #       returns success.
+            time.sleep(2)
+
+            # Step 3: Test request cancellation
+            frontend_log_offset, worker_log_offset = 0, 0
+
+            test_scenarios = [
+                ("completion", "Completion request cancellation"),
+                ("chat_completion", "Chat completion request cancellation"),
+                (
+                    "chat_completion_stream",
+                    "Chat completion stream request cancellation",
+                ),
+            ]
+
+            for i, (request_type, description) in enumerate(test_scenarios, 1):
+                logger.info(f"Testing {description.lower()}...")
+                send_request_and_cancel(request_type)
+
+                logger.info(
+                    "Checking for cancellation messages in worker and frontend logs..."
+                )
+                time.sleep(0.5)  # Make sure logs are written before proceeding
+                frontend_log_offset, worker_log_offset = verify_request_cancelled(
+                    frontend,
+                    worker,
+                    frontend_log_offset=frontend_log_offset,
+                    worker_log_offset=worker_log_offset,
+                )
+
+                logger.info(f"{description} detected successfully")
+
+            logger.info(
+                "All request cancellation tests completed successfully - request cancellation is working correctly"
+            )
+
+
+@pytest.mark.vllm
+@pytest.mark.gpu_1
+@pytest.mark.e2e
+@pytest.mark.slow
+def test_request_cancellation_vllm_decode(request, runtime_services):
+    """
+    End-to-end test for request cancellation functionality with remote prefill.
+
+    This test verifies that when a request is cancelled by the client,
+    the system properly handles the cancellation and cleans up resources
+    on the decode worker side in a disaggregated setup.
+    """
+    # Step 0: Download the model from HuggingFace if not already cached
+    download_model()
+
+    # Step 1: Start the frontend
+    with DynamoFrontendProcess(request) as frontend:
+        logger.info("Frontend started successfully")
+
+        # Step 2: Start the prefill worker
+        logger.info("Starting prefill worker...")
+        prefill_worker = DynamoWorkerProcess(request, is_prefill=True)
+
+        with prefill_worker:
+            logger.info(f"Prefill Worker PID: {prefill_worker.get_pid()}")
+
+            # Step 3: Start the decode worker
+            logger.info("Starting decode worker...")
+            decode_worker = DynamoWorkerProcess(request, is_prefill=False)
+
+            with decode_worker:
+                logger.info(f"Decode Worker PID: {decode_worker.get_pid()}")
+
+                # TODO: Why the model is not immediately available at the frontend after health check
+                #       returns success.
+                time.sleep(2)
+
+                # Step 4: Test request cancellation for completion scenario only
+                logger.info(
+                    "Testing completion request cancellation in disaggregated mode..."
+                )
+                send_request_and_cancel("completion")
+
+                logger.info(
+                    "Checking for cancellation messages in decode worker, prefill worker, and frontend logs..."
+                )
+                time.sleep(0.5)  # Make sure logs are written before proceeding
+                verify_request_cancelled(frontend, decode_worker, prefill_worker)
+
+                logger.info(
+                    "Completion request cancellation detected successfully in disaggregated mode"
+                )
+
+                logger.info(
+                    "Request cancellation test completed successfully in disaggregated mode - request cancellation is working correctly"
+                )
+
+
+@pytest.mark.skip(reason="require cancel support before receiving 1st response")
+@pytest.mark.vllm
+@pytest.mark.gpu_1
+@pytest.mark.e2e
+@pytest.mark.slow
+def test_request_cancellation_vllm_prefill(request, runtime_services):
+    """
+    End-to-end test for request cancellation on remote prefill.
+
+    This test verifies that when a request is cancelled by the client during the
+    prefill phase, the system properly handles the cancellation and cleans up
+    resources on the prefill worker and decode worker sides in a disaggregated
+    setup.
+    """