"vllm/vscode:/vscode.git/clone" did not exist on "ea6102b85da808b23055912391977f43fbe3f227"
Unverified Commit 480b41d1 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat: Python frontend / ingress node (#1912)

parent f00d700e
# Dynamo ingress / frontend node.
Usage: `python -m dynamo.ingress [--http-port <port>] [--kv-cache-block-size <size>]`. The HTTP port defaults to 8080.
This runs an OpenAI-compliant HTTP server, a pre-processor, and a router in a single process. Engines / workers are auto-discovered when they call `register_llm`.
Requires `etcd` and `nats-server -js`.
This is the same as `dynamo-run in=http out=dyn`.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.ingress.main import main
# Entry point for `python -m dynamo.ingress`: hand control to the frontend's
# main() only when this module is executed, not when it is imported.
if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Usage: `python -m dynamo.ingress [args]`
#
# Start a frontend node. This runs:
# - OpenAI HTTP server.
# - Auto-discovery: Watches etcd for engine/worker registration (via `register_llm`).
# - Pre-processor: Prompt templating and tokenization.
# - Router, defaulting to round-robin (TODO: Add flags to enable KV routing).
import argparse
import asyncio
import uvloop
from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime
def parse_args():
    """Parse the frontend's command-line flags from ``sys.argv``.

    Returns:
        dict: Keyword arguments to splat into ``EntrypointArgs``.
        ``http_port`` is always present (default 8080);
        ``kv_cache_block_size`` is included only when the flag was given.

    Values that are not integers, or that fall outside the unsigned range
    named in the flag's help text (u16 / u32), make argparse print a usage
    error and exit with status 2 instead of being passed downstream.
    """

    def unsigned(bits):
        """Build an argparse ``type`` callable accepting 0 .. 2**bits - 1."""
        upper = (1 << bits) - 1

        def convert(text):
            try:
                value = int(text)
            except ValueError:
                raise argparse.ArgumentTypeError(
                    f"invalid int value: {text!r}"
                ) from None
            if not 0 <= value <= upper:
                raise argparse.ArgumentTypeError(
                    f"{value} is out of range for u{bits} (0..{upper})"
                )
            return value

        return convert

    parser = argparse.ArgumentParser(
        description="Dynamo Frontend: HTTP+Pre-processor+Router",
        formatter_class=argparse.RawTextHelpFormatter,  # To preserve multi-line help formatting
    )
    parser.add_argument(
        "--kv-cache-block-size",
        type=unsigned(32),
        help="KV cache block size (u32).",
    )
    parser.add_argument(
        "--http-port",
        type=unsigned(16),
        default=8080,
        help="HTTP port for the engine (u16).",
    )
    flags = parser.parse_args()

    kwargs = {"http_port": flags.http_port}
    # Only forward the block size when the user set it explicitly, so the
    # callee's own default applies otherwise.
    if flags.kv_cache_block_size is not None:
        kwargs["kv_cache_block_size"] = flags.kv_cache_block_size
    return kwargs
async def async_main():
    """Build the dynamic engine and serve it over HTTP until cancelled.

    Steps: parse the command-line flags, create a ``DistributedRuntime`` on
    the running event loop, build an ``EngineType.Dynamic`` engine from the
    flags, then run the "http" input against that engine.
    """
    # Parse flags before creating the runtime so that `--help` and invalid
    # flags exit straight away, without first setting up the distributed
    # runtime (presumably the part that needs etcd / NATS -- confirm).
    flags = parse_args()

    # NOTE(review): the meaning of the second positional argument (False) is
    # not visible from this file -- confirm against DistributedRuntime.
    runtime = DistributedRuntime(asyncio.get_running_loop(), False)

    # out=dyn
    entrypoint_args = EntrypointArgs(EngineType.Dynamic, **flags)
    engine = await make_engine(runtime, entrypoint_args)

    # in=http
    try:
        await run_input(runtime, "http", engine)
    except asyncio.exceptions.CancelledError:
        # Cancellation is deliberately swallowed so it does not surface as an
        # error (presumably it is how shutdown is signalled -- confirm).
        pass
def main():
    """Synchronous entry point: drive ``async_main`` to completion on uvloop."""
    frontend = async_main()
    uvloop.run(frontend)
# Start the frontend only when this file is run as a script; importing the
# module must not launch the server.
if __name__ == "__main__":
    main()
......@@ -81,7 +81,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo"]
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo"]
# This section is for including the binaries in the wheel package
# but doesn't make them executable scripts in the venv bin directory
......
......@@ -285,7 +285,6 @@ class DynamoServeProcess(ManagedProcess):
(f"http://localhost:{port}/v1/models", self._check_model)
]
health_check_ports = [port]
env = None
self.port = port
self.graph = graph
......@@ -305,7 +304,6 @@ class DynamoServeProcess(ManagedProcess):
"from multiprocessing.spawn",
],
log_dir=request.node.name,
env=env, # Pass the environment variables
)
def _check_model(self, response):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment