Unverified Commit bc6be89d authored by Hyunkyun Moon's avatar Hyunkyun Moon Committed by GitHub
Browse files

[Frontend] Add vllm launch command for GPU-less preprocessing serving (#34551)


Signed-off-by: HyunKyun Moon <mhg5303@gmail.com>
parent 32224f56
......@@ -7,7 +7,7 @@ import httpx
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""E2E tests for render endpoints via `vllm launch` (GPU-less serving)."""
import httpx
import pytest
import pytest_asyncio
from ...utils import RemoteLaunchRenderServer
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
@pytest.fixture(scope="module")
def server():
    """Start one ``vllm launch render`` server shared by every test in the module."""
    extra_cli_args: list[str] = []
    render_server = RemoteLaunchRenderServer(
        MODEL_NAME, extra_cli_args, max_wait_seconds=120
    )
    with render_server as running:
        yield running
@pytest_asyncio.fixture
async def client(server):
    """Yield an async HTTP client whose base URL is the render server's root."""
    root_url = server.url_for("")
    async with httpx.AsyncClient(base_url=root_url, timeout=30.0) as session:
        yield session
# -- Chat Completion Render --
@pytest.mark.asyncio
async def test_chat_render_basic(client):
    """A single-turn chat renders to a ``[conversation, engine_prompts]`` pair."""
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Hello, how are you?"}],
    }
    resp = await client.post("/v1/chat/completions/render", json=payload)
    assert resp.status_code == 200

    body = resp.json()
    assert isinstance(body, list)
    assert len(body) == 2

    conversation, engine_prompts = body
    assert isinstance(conversation, list)
    assert conversation[0]["role"] == "user"
    assert isinstance(engine_prompts, list)
    assert len(engine_prompts) > 0

    # The first engine prompt must carry both the text and its token ids.
    head = engine_prompts[0]
    assert "prompt_token_ids" in head
    assert "prompt" in head
    assert isinstance(head["prompt_token_ids"], list)
    assert all(isinstance(tok, int) for tok in head["prompt_token_ids"])
@pytest.mark.asyncio
async def test_chat_render_multi_turn(client):
    """A three-message chat keeps its message count and role order when rendered."""
    turns = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there!"},
        {"role": "user", "content": "How are you?"},
    ]
    resp = await client.post(
        "/v1/chat/completions/render",
        json={"model": MODEL_NAME, "messages": turns},
    )
    assert resp.status_code == 200

    conversation, engine_prompts = resp.json()
    assert len(conversation) == 3
    first, second, third = conversation
    assert first["role"] == "user"
    assert second["role"] == "assistant"
    assert third["role"] == "user"

    assert len(engine_prompts) > 0
    assert len(engine_prompts[0]["prompt_token_ids"]) > 0
@pytest.mark.asyncio
async def test_chat_render_invalid_model(client):
    """Chat render with an unknown model name answers 404 with an ``error`` body."""
    payload = {
        "model": "nonexistent-model",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    resp = await client.post("/v1/chat/completions/render", json=payload)

    assert resp.status_code == 404
    assert "error" in resp.json()
# -- Completion Render --
@pytest.mark.asyncio
async def test_completion_render_basic(client):
    """A single text prompt renders to a non-empty list of tokenized prompts."""
    payload = {"model": MODEL_NAME, "prompt": "Once upon a time"}
    resp = await client.post("/v1/completions/render", json=payload)
    assert resp.status_code == 200

    rendered = resp.json()
    assert isinstance(rendered, list)
    assert len(rendered) > 0

    head = rendered[0]
    assert "prompt_token_ids" in head
    assert "prompt" in head
    assert isinstance(head["prompt_token_ids"], list)
    assert len(head["prompt_token_ids"]) > 0
    # The original text must appear in the rendered prompt string.
    assert "Once upon a time" in head["prompt"]
@pytest.mark.asyncio
async def test_completion_render_multiple_prompts(client):
    """Two input prompts yield exactly two rendered, tokenized entries."""
    payload = {"model": MODEL_NAME, "prompt": ["Hello world", "Goodbye world"]}
    resp = await client.post("/v1/completions/render", json=payload)
    assert resp.status_code == 200

    rendered = resp.json()
    assert isinstance(rendered, list)
    assert len(rendered) == 2

    for entry in rendered:
        assert "prompt_token_ids" in entry
        assert "prompt" in entry
        assert len(entry["prompt_token_ids"]) > 0
@pytest.mark.asyncio
async def test_completion_render_invalid_model(client):
    """Completion render with an unknown model answers 404 with an ``error`` body."""
    payload = {"model": "nonexistent-model", "prompt": "Hello"}
    resp = await client.post("/v1/completions/render", json=payload)

    assert resp.status_code == 404
    assert "error" in resp.json()
@pytest.mark.asyncio
async def test_render_is_fast(client):
    """Render should complete quickly since there is no inference."""
    from time import perf_counter

    began = perf_counter()
    resp = await client.post(
        "/v1/completions/render",
        json={"model": MODEL_NAME, "prompt": "Tell me a very long story about " * 10},
    )
    finished = perf_counter()

    assert resp.status_code == 200
    # Wall-clock budget for one render round trip.
    assert finished - began < 2.0
# -- Health & Models --
@pytest.mark.asyncio
async def test_health_endpoint(client):
    """``GET /health`` answers 200 on the render server."""
    health = await client.get("/health")
    assert health.status_code == 200
@pytest.mark.asyncio
async def test_models_endpoint(client):
    """``GET /v1/models`` lists the model the server was launched with."""
    resp = await client.get("/v1/models")
    assert resp.status_code == 200

    listing = resp.json()
    assert "data" in listing
    served_ids = [entry["id"] for entry in listing["data"]]
    assert MODEL_NAME in served_ids
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for the `vllm launch` CLI subcommand."""
import argparse
from unittest.mock import patch
import pytest
from vllm.entrypoints.cli.launch import (
LaunchSubcommand,
RenderSubcommand,
cmd_init,
)
from vllm.utils.argparse_utils import FlexibleArgumentParser
@pytest.fixture
def launch_parser():
    """Build a root parser with the ``launch`` subcommand registered on it."""
    root = FlexibleArgumentParser(description="test")
    LaunchSubcommand().subparser_init(
        root.add_subparsers(required=False, dest="subparser")
    )
    return root
def test_subcommand_name():
    """The subcommand registers itself under the name ``launch``."""
    subcommand = LaunchSubcommand()
    assert subcommand.name == "launch"
def test_cmd_init_returns_subcommand():
    """``cmd_init`` returns exactly one ``LaunchSubcommand`` instance."""
    subcommands = cmd_init()
    assert len(subcommands) == 1
    assert isinstance(subcommands[0], LaunchSubcommand)
# -- Parsing: `vllm launch render` --
def test_parse_launch_render(launch_parser):
    """``launch render`` is parsed with ``launch_component`` set to ``render``."""
    argv = ["launch", "render", "--model", "test-model"]
    parsed = launch_parser.parse_args(argv)
    assert parsed.launch_component == "render"
def test_parse_launch_requires_component(launch_parser):
    """``launch`` without a component name makes the parser exit."""
    argv = ["launch", "--model", "test-model"]
    with pytest.raises(SystemExit):
        launch_parser.parse_args(argv)
def test_parse_launch_invalid_component(launch_parser):
    """An unregistered component name makes the parser exit."""
    argv = ["launch", "unknown", "--model", "test-model"]
    with pytest.raises(SystemExit):
        launch_parser.parse_args(argv)
# -- Dispatch --
def test_cmd_launch_render_calls_run():
    """``RenderSubcommand.cmd`` hands its server coroutine to ``uvloop.run`` once."""
    args = argparse.Namespace(model_tag=None, model="test-model")
    with patch("vllm.entrypoints.cli.launch.uvloop.run") as mock_uvloop_run:
        RenderSubcommand.cmd(args)
    mock_uvloop_run.assert_called_once()

    # ``uvloop.run`` is mocked, so the coroutine object it received is never
    # awaited.  Close it explicitly to avoid a "coroutine ... was never
    # awaited" RuntimeWarning when it is garbage collected.
    (launched,) = mock_uvloop_run.call_args.args
    launched.close()
def test_cmd_launch_model_tag_overrides():
    """A non-``None`` ``model_tag`` replaces ``args.model`` before dispatch."""

    def _noop(_namespace):
        return None

    args = argparse.Namespace(
        model_tag="tag-model",
        model="original-model",
        launch_command=_noop,
    )
    LaunchSubcommand.cmd(args)

    assert args.model == "tag-model"
def test_cmd_launch_model_tag_none():
    """A ``None`` ``model_tag`` leaves ``args.model`` untouched."""

    def _noop(_namespace):
        return None

    args = argparse.Namespace(
        model_tag=None,
        model="original-model",
        launch_command=_noop,
    )
    LaunchSubcommand.cmd(args)

    assert args.model == "original-model"
def test_cmd_dispatches():
    """``LaunchSubcommand.cmd`` invokes the ``launch_command`` stored on args."""
    received = []

    # ``list.append`` stands in for the component command and records each call.
    args = argparse.Namespace(launch_command=received.append)
    LaunchSubcommand.cmd(args)

    assert received
# -- Module registration --
def test_subparser_init_returns_parser():
    """``subparser_init`` returns a ``FlexibleArgumentParser``."""
    root = FlexibleArgumentParser(description="test")
    registry = root.add_subparsers(required=False, dest="subparser")
    launch_parser = LaunchSubcommand().subparser_init(registry)
    assert isinstance(launch_parser, FlexibleArgumentParser)
def test_launch_registered_in_main():
    """Verify that launch module is importable as a CLI module."""
    import vllm.entrypoints.cli.launch as launch_module

    assert hasattr(launch_module, "cmd_init")

    registered = launch_module.cmd_init()
    assert any(subcommand.name == "launch" for subcommand in registered)
......@@ -110,31 +110,25 @@ VLLM_PATH = Path(__file__).parent.parent
"""Path to root of the vLLM repository."""
class RemoteOpenAIServer:
class RemoteVLLMServer:
"""Base class for launching vLLM server subprocesses for testing.
Subclasses must override ``_create_cli_subcommand`` and
``_start_server``.
"""
DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
proc: subprocess.Popen
def _create_cli_subcommand(self):
"""Return a CLISubcommand instance used to parse CLI args."""
raise NotImplementedError
def _start_server(
self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
) -> None:
"""Subclasses override this method to customize server process launch"""
env = os.environ.copy()
# the current process might initialize cuda,
# to be safe, we should use spawn method
env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
if env_dict is not None:
env.update(env_dict)
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
print(f"Environment variables: {env}")
self.proc: subprocess.Popen = subprocess.Popen(
serve_cmd,
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
# Create a dedicated process group so we can kill
# the entire tree (parent + EngineCore + workers) at once.
start_new_session=True,
)
raise NotImplementedError
def __init__(
self,
......@@ -171,9 +165,9 @@ class RemoteOpenAIServer:
json.dumps(override_hf_configs),
]
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
parser = FlexibleArgumentParser(description="vLLM's remote server.")
subparsers = parser.add_subparsers(required=False, dest="subparser")
parser = ServeSubcommand().subparser_init(subparsers)
parser = self._create_cli_subcommand().subparser_init(subparsers)
args = parser.parse_args(["--model", model, *vllm_serve_args])
self.uds = args.uds
if args.uds:
......@@ -183,7 +177,9 @@ class RemoteOpenAIServer:
self.host = str(args.host or "127.0.0.1")
self.port = int(args.port)
self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None
self.show_hidden_metrics = (
getattr(args, "show_hidden_metrics_for_version", None) is not None
)
# download the model before starting the server to avoid timeout
is_local = os.path.isdir(model)
......@@ -201,7 +197,8 @@ class RemoteOpenAIServer:
if self._pre_server_gpu_memory is not None:
pre_gb = self._pre_server_gpu_memory / 1e9
print(
f"[RemoteOpenAIServer] GPU memory before server start: {pre_gb:.2f} GB"
f"[{type(self).__name__}] GPU memory before server start: "
f"{pre_gb:.2f} GB"
)
self._start_server(model, vllm_serve_args, env_dict)
......@@ -452,6 +449,62 @@ class RemoteOpenAIServer:
)
class RemoteOpenAIServer(RemoteVLLMServer):
    """Launches ``vllm serve`` for testing OpenAI-compatible endpoints."""

    def _create_cli_subcommand(self):
        """Return the ``vllm serve`` subcommand used to parse the CLI args."""
        return ServeSubcommand()

    def _start_server(
        self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
    ) -> None:
        """Spawn ``vllm serve <model> <args...>`` and store the process on ``self.proc``.

        The child inherits a copy of the current environment, with
        ``env_dict`` (if given) applied on top.
        """
        child_env = dict(os.environ)
        # the current process might initialize cuda,
        # to be safe, we should use spawn method
        child_env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        child_env.update(env_dict or {})

        command = ["vllm", "serve", model, *vllm_serve_args]
        print(f"Launching RemoteOpenAIServer with: {' '.join(command)}")
        print(f"Environment variables: {child_env}")

        # Create a dedicated process group so we can kill
        # the entire tree (parent + EngineCore + workers) at once.
        self.proc = subprocess.Popen(
            command,
            env=child_env,
            stdout=sys.stdout,
            stderr=sys.stderr,
            start_new_session=True,
        )
class RemoteLaunchRenderServer(RemoteVLLMServer):
    """Launches ``vllm launch render`` for GPU-less serving tests."""

    def _create_cli_subcommand(self):
        """Return the subcommand whose parser is used to read host/port etc.

        NOTE(review): this returns the ``vllm serve`` subcommand even though
        the process started below is ``vllm launch render`` -- presumably
        because both accept the same serving arguments; confirm.
        """
        return ServeSubcommand()

    def _start_server(
        self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
    ) -> None:
        """Spawn ``vllm launch render <model> <args...>`` and keep it on ``self.proc``.

        The child inherits a copy of the current environment, with
        ``env_dict`` (if given) applied on top.
        """
        child_env = dict(os.environ)
        child_env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        child_env.update(env_dict or {})

        command = ["vllm", "launch", "render", model, *vllm_serve_args]
        print(f"Launching RemoteLaunchRenderServer with: {' '.join(command)}")

        # start_new_session=True runs the child in a new session.
        self.proc = subprocess.Popen(
            command,
            env=child_env,
            stdout=sys.stdout,
            stderr=sys.stderr,
            start_new_session=True,
        )

    def _wait_for_gpu_memory_release(self, timeout: float = 30.0):
        """Do nothing: the render server uses no GPU."""
        return None
class RemoteOpenAIServerCustom(RemoteOpenAIServer):
"""Launch test server with custom child process"""
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import uvloop
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.cli.types import CLISubcommand
from vllm.entrypoints.openai.api_server import (
build_and_serve,
setup_server,
)
from vllm.entrypoints.openai.cli_args import (
make_arg_parser,
validate_parsed_serve_args,
)
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
from vllm.logger import init_logger
from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)
DESCRIPTION = "Launch individual vLLM components."
class LaunchSubcommandBase(CLISubcommand):
    """Common base for the component subcommands nested under `vllm launch`."""

    # One-line description; used as both help and description of the
    # component's sub-parser.
    help: str

    @classmethod
    def add_cli_args(cls, parser: FlexibleArgumentParser) -> None:
        """Register this component's CLI arguments on ``parser``.

        The default registers the standard vLLM serving arguments via
        ``make_arg_parser``.  Subclasses can override to add
        component-specific arguments.
        """
        make_arg_parser(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the component; must be provided by each subclass."""
        raise NotImplementedError
class RenderSubcommand(LaunchSubcommandBase):
    """`vllm launch render`: the GPU-less rendering server component."""

    name = "render"
    help = "Launch a GPU-less rendering server (preprocessing and postprocessing only)."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the launch FastAPI server to completion on a uvloop event loop."""
        server_coro = run_launch_fastapi(args)
        uvloop.run(server_coro)
class LaunchSubcommand(CLISubcommand):
    """The `launch` subcommand for the vLLM CLI.

    Every direct subclass of ``LaunchSubcommandBase`` becomes a nested
    sub-subcommand, so each component can define its own arguments
    independently (e.g. ``vllm launch render``).
    """

    name = "launch"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Apply a ``model_tag`` override, if any, then run the chosen component."""
        model_tag = getattr(args, "model_tag", None)
        if model_tag is not None:
            args.model = model_tag
        args.launch_command(args)

    def validate(self, args: argparse.Namespace) -> None:
        """Validate the parsed arguments with the serve-argument validator."""
        validate_parsed_serve_args(args)

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Register ``launch`` and one nested parser per component.

        Returns the ``launch`` parser itself, not any component parser.
        """
        top_parser = subparsers.add_parser(
            self.name,
            help=DESCRIPTION,
            description=DESCRIPTION,
            usage=f"vllm {self.name} <component> [options]",
        )
        # A component name is mandatory: ``vllm launch`` alone is an error.
        component_parsers = top_parser.add_subparsers(
            required=True, dest="launch_component"
        )

        for component_cls in LaunchSubcommandBase.__subclasses__():
            qualified_name = f"{self.name} {component_cls.name}"
            component_parser = component_parsers.add_parser(
                component_cls.name,
                help=component_cls.help,
                description=component_cls.help,
                usage=f"vllm {qualified_name} [options]",
            )
            # ``cmd`` above dispatches through this default.
            component_parser.set_defaults(launch_command=component_cls.cmd)
            component_cls.add_cli_args(component_parser)
            component_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
                subcmd=qualified_name
            )

        return top_parser
def cmd_init() -> list[CLISubcommand]:
    """Return the CLI subcommands this module provides (just ``launch``)."""
    subcommands: list[CLISubcommand] = [LaunchSubcommand()]
    return subcommands
async def run_launch_fastapi(args: argparse.Namespace) -> None:
    """Run the online serving layer with FastAPI (no GPU inference).

    Binds the listening socket, builds a ``LaunchEngineClient`` from the
    parsed CLI arguments, starts the HTTP server and waits for it to shut
    down.

    Args:
        args: Parsed CLI namespace for ``vllm launch render``.
    """
    from vllm.config import VllmConfig
    from vllm.v1.engine.launch import LaunchEngineClient

    # 1. Socket binding
    listen_address, sock = setup_server(args)

    # Everything after the bind is guarded so that the socket is closed even
    # if config/client construction or server startup raises, not only when
    # the shutdown task completes.
    try:
        # 2. Create LaunchEngineClient (no GPU)
        engine_args = AsyncEngineArgs.from_cli_args(args)
        model_config = engine_args.create_model_config()
        vllm_config = VllmConfig(model_config=model_config)
        engine_client = LaunchEngineClient.from_vllm_config(vllm_config)

        # 3. Build app, initialize state, and start serving
        shutdown_task = await build_and_serve(
            engine_client, listen_address, sock, args
        )
        await shutdown_task
    finally:
        sock.close()
......@@ -16,6 +16,7 @@ logger = init_logger(__name__)
def main():
import vllm.entrypoints.cli.benchmark.main
import vllm.entrypoints.cli.collect_env
import vllm.entrypoints.cli.launch
import vllm.entrypoints.cli.openai
import vllm.entrypoints.cli.run_batch
import vllm.entrypoints.cli.serve
......@@ -25,6 +26,7 @@ def main():
CMD_MODULES = [
vllm.entrypoints.cli.openai,
vllm.entrypoints.cli.serve,
vllm.entrypoints.cli.launch,
vllm.entrypoints.cli.benchmark.main,
vllm.entrypoints.cli.collect_env,
vllm.entrypoints.cli.run_batch,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import importlib
import inspect
import multiprocessing
......@@ -194,7 +195,7 @@ def build_app(
register_sagemaker_api_router(app, supported_tasks)
if "generate" in supported_tasks:
if any(task in supported_tasks for task in ("generate", "render")):
from vllm.entrypoints.openai.generate.api_router import (
register_generate_api_routers,
)
......@@ -357,7 +358,7 @@ async def init_app_state(
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks:
if any(task in supported_tasks for task in ("generate", "render")):
from vllm.entrypoints.openai.generate.api_router import init_generate_state
await init_generate_state(
......@@ -469,48 +470,32 @@ def setup_server(args):
return listen_address, sock
async def run_server(args, **uvicorn_kwargs) -> None:
"""Run a single-worker API server."""
# Add process-specific prefix to stdout and stderr.
decorate_logs("APIServer")
listen_address, sock = setup_server(args)
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
async def run_server_worker(
listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
"""Run a single API server worker."""
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
async def build_and_serve(
engine_client: EngineClient,
listen_address: str,
sock: socket.socket,
args: Namespace,
**uvicorn_kwargs,
) -> asyncio.Task:
"""Build FastAPI app, initialize state, and start serving.
if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
Returns the shutdown task for the caller to await.
"""
# Get uvicorn log config (from file or with endpoint filter)
log_config = get_uvicorn_log_config(args)
if log_config is not None:
uvicorn_kwargs["log_config"] = log_config
async with build_async_engine_client(
args,
client_config=client_config,
) as engine_client:
supported_tasks = await engine_client.get_supported_tasks()
logger.info("Supported tasks: %s", supported_tasks)
app = build_app(args, supported_tasks)
await init_app_state(engine_client, app.state, args, supported_tasks)
logger.info(
"Starting vLLM API server %d on %s",
engine_client.vllm_config.parallel_config._api_process_rank,
listen_address,
)
shutdown_task = await serve_http(
logger.info("Starting vLLM server on %s", listen_address)
return await serve_http(
app,
sock=sock,
enable_ssl_refresh=args.enable_ssl_refresh,
......@@ -531,6 +516,35 @@ async def run_server_worker(
**uvicorn_kwargs,
)
async def run_server(args, **uvicorn_kwargs) -> None:
"""Run a single-worker API server."""
# Add process-specific prefix to stdout and stderr.
decorate_logs("APIServer")
listen_address, sock = setup_server(args)
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
async def run_server_worker(
listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
"""Run a single API server worker."""
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
async with build_async_engine_client(
args,
client_config=client_config,
) as engine_client:
shutdown_task = await build_and_serve(
engine_client, listen_address, sock, args, **uvicorn_kwargs
)
# NB: Await server shutdown only after the backend context is exited
try:
await shutdown_task
......
......@@ -113,7 +113,7 @@ async def init_generate_state(
enable_log_deltas=args.enable_log_deltas,
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks
if any(task in supported_tasks for task in ("generate", "render"))
else None
)
# Warm up chat template processing to avoid first-request latency
......@@ -129,7 +129,7 @@ async def init_generate_state(
enable_force_include_usage=args.enable_force_include_usage,
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks
if any(task in supported_tasks for task in ("generate", "render"))
else None
)
state.anthropic_serving_messages = (
......
......@@ -10,4 +10,7 @@ PoolingTask = Literal[
]
POOLING_TASKS: tuple[PoolingTask, ...] = get_args(PoolingTask)
SupportedTask = Literal[GenerationTask, PoolingTask]
FrontendTask = Literal["render"]
FRONTEND_TASKS: tuple[FrontendTask, ...] = get_args(FrontendTask)
SupportedTask = Literal[GenerationTask, PoolingTask, FrontendTask]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LaunchEngineClient: A lightweight EngineClient for GPU-less online serving.
This implements the EngineClient protocol without AsyncLLM or EngineCore,
enabling preprocessing (tokenization, rendering) and postprocessing
(detokenization) without GPU inference.
"""
from collections.abc import AsyncGenerator, Iterable, Mapping
from typing import Any
from vllm.config import VllmConfig
from vllm.engine.protocol import EngineClient, StreamingInput
from vllm.inputs import ProcessorInputs, PromptType
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.renderers import renderer_from_config
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.v1.engine import EngineCoreRequest, PauseMode
from vllm.v1.engine.input_processor import InputProcessor
logger = init_logger(__name__)
class LaunchEngineClient(EngineClient):
    """EngineClient for GPU-less serving: preprocessing/postprocessing only.

    This is a Null Object at the EngineClient level and bypasses AsyncLLM
    entirely.  The constructor builds the renderer, io_processor and
    input_processor used for tokenization and rendering; ``generate`` and
    ``encode`` raise ``NotImplementedError``, and the remaining control
    methods are no-ops that return fixed values.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
    ) -> None:
        """Build the renderer, IO processor and input processor from the config."""
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config

        self.renderer = renderer_from_config(self.vllm_config)
        self.io_processor = get_io_processor(
            self.vllm_config,
            self.renderer,
            self.model_config.io_processor_plugin,
        )

        # Convert TokPrompt --> EngineCoreRequest.
        self.input_processor = InputProcessor(self.vllm_config, self.renderer)

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
    ) -> "LaunchEngineClient":
        """Create a LaunchEngineClient from a VllmConfig without GPU."""
        return cls(vllm_config=vllm_config)

    # -- Task support --

    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        """Report ``render`` as the only supported task."""
        return ("render",)

    # -- Inference (not supported) --

    async def generate(
        self,
        prompt: EngineCoreRequest
        | PromptType
        | ProcessorInputs
        | AsyncGenerator[StreamingInput, None],
        sampling_params: SamplingParams,
        request_id: str,
        *,
        prompt_text: str | None = None,
        lora_request: LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        data_parallel_rank: int | None = None,
        reasoning_ended: bool | None = None,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Always raise ``NotImplementedError`` on first iteration."""
        raise NotImplementedError(
            "LaunchEngineClient does not support inference. "
            "Use vllm serve for generation requests."
        )
        # Unreachable, but the yield keeps this function an async generator.
        yield  # type: ignore[misc]  # pragma: no cover

    # -- Request management (no-op) --

    async def abort(
        self, request_id: str | Iterable[str], internal: bool = False
    ) -> None:
        """No-op."""
        return None

    # -- Generation control (no-op) --

    async def pause_generation(
        self,
        *,
        mode: PauseMode = "abort",
        wait_for_inflight_requests: bool | None = None,
        clear_cache: bool = True,
    ) -> None:
        """No-op."""
        return None

    async def resume_generation(self) -> None:
        """No-op."""
        return None

    async def is_paused(self) -> bool:
        """Always ``False``."""
        return False

    async def encode(
        self,
        prompt: PromptType | ProcessorInputs,
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: LoRARequest | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        tokenization_kwargs: dict[str, Any] | None = None,
        reasoning_ended: bool | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Always raise ``NotImplementedError`` on first iteration."""
        raise NotImplementedError(
            "LaunchEngineClient does not support inference. "
            "Use vllm serve for encoding requests."
        )
        # Unreachable, but the yield keeps this function an async generator.
        yield  # type: ignore[misc]  # pragma: no cover

    # -- Observability (no-op / defaults) --

    async def is_tracing_enabled(self) -> bool:
        """Always ``False``."""
        return False

    async def do_log_stats(self) -> None:
        """No-op."""
        return None

    async def check_health(self) -> None:
        """No-op; never raises."""
        return None

    async def start_profile(self) -> None:
        """No-op."""
        return None

    async def stop_profile(self) -> None:
        """No-op."""
        return None

    # -- Cache management (no-op) --

    async def reset_mm_cache(self) -> None:
        """No-op."""
        return None

    async def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        """No-op; always returns ``True``."""
        return True

    async def reset_encoder_cache(self) -> None:
        """No-op."""
        return None

    # -- Power management (no-op) --

    async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        """No-op."""
        return None

    async def wake_up(self, tags: list[str] | None = None) -> None:
        """No-op."""
        return None

    async def is_sleeping(self) -> bool:
        """Always ``False``."""
        return False

    # -- LoRA (not supported) --

    async def add_lora(self, lora_request: LoRARequest) -> bool:
        """Always returns ``False``; nothing is loaded."""
        return False

    # -- Status properties --

    @property
    def is_running(self) -> bool:
        """Always ``True``."""
        return True

    @property
    def is_stopped(self) -> bool:
        """Always ``False``."""
        return False

    @property
    def errored(self) -> bool:
        """Always ``False``."""
        return False

    @property
    def dead_error(self) -> BaseException:
        """Return (not raise) a fresh ``RuntimeError`` describing this client."""
        return RuntimeError("LaunchEngineClient does not support inference")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment