Commit 602352ce authored by Neelay Shah, committed by GitHub

chore: rename dynamo (#44)


Co-authored-by: Biswa Panda <biswa.panda@gmail.com>
parent ecf53ce2
@@ -32,7 +32,7 @@ from tensorrt_llm.serve.openai_protocol import (
CompletionStreamResponse,
)
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
logger.set_level("debug")
@@ -45,7 +45,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
def __init__(self, engine_config: LLMAPIConfig):
super().__init__(engine_config)
-@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, request):
if self._llm_engine is None:
raise RuntimeError("Engine not initialized")
@@ -93,7 +93,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
except Exception as e:
raise RuntimeError("Failed to generate: " + str(e))
-@dynemo_endpoint(CompletionRequest, CompletionStreamResponse)
+@dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completion(self, request):
if self._llm_engine is None:
raise RuntimeError("Engine not initialized")
@@ -140,13 +140,13 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
raise RuntimeError("Failed to generate: " + str(e))
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_config: LLMAPIConfig):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
-component = runtime.namespace("dynemo").component("tensorrt-llm")
+component = runtime.namespace("dynamo").component("tensorrt-llm")
await component.create_service()
completions_endpoint = component.endpoint("completions")
......
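Taken together, the hunks above capture the pattern this commit renames: a `@dynamo_worker()` entrypoint receives a `DistributedRuntime`, registers a component in the `dynamo` namespace, and serves `@dynamo_endpoint`-decorated async generators. A minimal sketch of that pattern, assuming the `dynamo.runtime` API shown in this diff plus a `serve_endpoint` call and launch boilerplate that the diff elides:

```python
import asyncio

import uvloop

from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker


class EchoEngine:
    # The request/response types here are illustrative stand-ins for the
    # OpenAI-style pydantic models imported in the hunks above.
    @dynamo_endpoint(str, str)
    async def generate(self, request):
        yield f"echo: {request}"  # stream back a single chunk


@dynamo_worker()
async def worker(runtime: DistributedRuntime):
    component = runtime.namespace("dynamo").component("echo")
    await component.create_service()
    endpoint = component.endpoint("generate")
    # serve_endpoint is assumed; the serving call itself is not visible in this diff
    await endpoint.serve_endpoint(EchoEngine().generate)


if __name__ == "__main__":
    uvloop.install()
    asyncio.run(worker())
```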
@@ -63,7 +63,7 @@ By default the server will run on port 8080.
Add model to the server:
```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.vllm.generate
+llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.generate
```
##### Example Output
@@ -71,7 +71,7 @@ llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.vllm
+------------+------------------------------------------+-----------+-----------+----------+
| MODEL TYPE | MODEL NAME | NAMESPACE | COMPONENT | ENDPOINT |
+------------+------------------------------------------+-----------+-----------+----------+
-| chat | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | dynemo | vllm | generate |
+| chat | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | dynamo | vllm | generate |
+------------+------------------------------------------+-----------+-----------+----------+
```
@@ -83,7 +83,7 @@ In a separate terminal run the vllm worker:
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
# Launch worker
cd /workspace/examples/python_rs/llm/vllm
@@ -116,7 +116,7 @@ This deployment option splits the model serving across prefill and decode worker
**Terminal 1 - Prefill Worker:**
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
# Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm
@@ -126,7 +126,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat
--enforce-eager \
--tensor-parallel-size 1 \
--kv-transfer-config \
-'{"kv_connector":"DynemoNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+'{"kv_connector":"DynamoNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
```
##### Example Output
@@ -142,7 +142,7 @@ INFO 03-02 05:59:47 llm_engine.py:476] init engine (profile, create kv cache, wa
**Terminal 2 - Decode Worker:**
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
# Launch decode worker
cd /workspace/examples/python_rs/llm/vllm
@@ -152,7 +152,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg
--enforce-eager \
--tensor-parallel-size 2 \
--kv-transfer-config \
-'{"kv_connector":"DynemoNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+'{"kv_connector":"DynamoNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
```
The disaggregated deployment utilizes separate GPUs for prefill and decode operations, allowing for optimized resource allocation and improved performance. For more details on the disaggregated deployment, please refer to the [vLLM documentation](https://docs.vllm.ai/en/latest/features/disagg_prefill.html).
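The only difference between the prefill and decode `--kv-transfer-config` payloads above is the role and rank: the prefill worker is the KV producer (rank 0), the decode worker is the KV consumer (rank 1), and `kv_parallel_size` counts both ends of the transfer. A small sketch that reproduces the same JSON strings, using the connector name as renamed by this commit:

```python
import json


def kv_transfer_config(role: str, rank: int, parallel_size: int = 2) -> str:
    """Build the JSON string passed via --kv-transfer-config."""
    return json.dumps({
        "kv_connector": "DynamoNcclConnector",
        "kv_role": role,      # "kv_producer" on prefill, "kv_consumer" on decode
        "kv_rank": rank,      # 0 for the producer, 1 for the consumer
        "kv_parallel_size": parallel_size,
    })


print(kv_transfer_config("kv_producer", 0))  # prefill worker (Terminal 1)
print(kv_transfer_config("kv_consumer", 1))  # decode worker (Terminal 2)
```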
@@ -254,7 +254,7 @@ tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {}
**Terminal 1 - Router:**
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
# Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm
@@ -270,7 +270,7 @@ You can choose only the prefix strategy for now:
**Terminal 2 - Processor:**
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
# Processor must take the same args as the worker
# This is temporary until we communicate the ModelDeploymentCard over etcd
@@ -286,7 +286,7 @@ RUST_LOG=info python3 -m kv_router.processor \
**Terminal 3 and 4 - Workers:**
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
# Launch Worker 1 and Worker 2 with same command
cd /workspace/examples/python_rs/llm/vllm
@@ -304,7 +304,7 @@ Note: block-size must be 64, otherwise Router won't work (accepts only 64 tokens
**Terminal 5 - Client:**
Don't forget to add the model to the server:
```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.process.chat/completions
+llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.process.chat/completions
```
```bash
@@ -351,7 +351,7 @@ Run following commands in 4 terminals:
**Terminal 1 - vLLM Worker:**
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
@@ -361,7 +361,7 @@ RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Dis
```bash
# Activate virtual environment
-source /opt/dynemo/venv/bin/activate
+source /opt/dynamo/venv/bin/activate
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m preprocessor.processor --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
@@ -377,7 +377,7 @@ By default the server will run on port 8080.
Add model to the server:
```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B dynemo.preprocessor.generate
+llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B dynamo.preprocessor.generate
```
**Terminal 4 - client**
......
@@ -19,12 +19,12 @@ import asyncio
import uvloop
-from dynemo.runtime import DistributedRuntime, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_worker
from .protocol import Request
-@dynemo_worker()
+@dynamo_worker()
async def worker(
runtime: DistributedRuntime,
component: str,
@@ -36,7 +36,7 @@ async def worker(
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
-endpoint = runtime.namespace("dynemo").component(component).endpoint("generate")
+endpoint = runtime.namespace("dynamo").component(component).endpoint("generate")
# create client
client = await endpoint.client()
......
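The client hunk stops right after `endpoint.client()`. A sketch of how such a client is typically driven, assuming it exposes an async `generate` call that returns a stream of responses; the method name and the `Request` fields are assumptions, since the call site is elided here:

```python
# Hypothetical continuation of worker() above; client.generate and the
# Request field names are assumed, not shown in this diff.
async def send_one(client):
    request = Request(prompt="Hello")  # field name assumed
    stream = await client.generate(request.model_dump_json())
    async for response in stream:
        print(response)
```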
@@ -31,7 +31,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.logger import logger as vllm_logger
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
@@ -54,7 +54,7 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank
-@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, raw_request):
if self.engine_client is None:
await self.initialize()
@@ -103,17 +103,17 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
await prefill_output
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
-component = runtime.namespace("dynemo").component("vllm")
+component = runtime.namespace("dynamo").component("vllm")
await component.create_service()
prefill = (
-await runtime.namespace("dynemo")
+await runtime.namespace("dynamo")
.component("prefill")
.endpoint("generate")
.client()
......
@@ -24,7 +24,7 @@ from common.protocol import PrefillRequest, PrefillResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmPrefillEngine(BaseVllmEngine):
@@ -45,7 +45,7 @@ class VllmPrefillEngine(BaseVllmEngine):
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank
-@dynemo_endpoint(PrefillRequest, PrefillResponse)
+@dynamo_endpoint(PrefillRequest, PrefillResponse)
async def generate(self, request):
if self.engine_client is None:
await self.initialize()
@@ -62,13 +62,13 @@ class VllmPrefillEngine(BaseVllmEngine):
yield True
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
-component = runtime.namespace("dynemo").component("prefill")
+component = runtime.namespace("dynamo").component("prefill")
await component.create_service()
async with VllmPrefillEngine(engine_args) as prefill_engine:
......
@@ -34,7 +34,7 @@ from vllm.logger import logger as vllm_logger
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from dynemo.runtime import Client, DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.runtime import Client, DistributedRuntime, dynamo_endpoint, dynamo_worker
class RequestType(Enum):
@@ -157,38 +157,38 @@ class Processor(ProcessMixIn):
f"Request type {request_type} not implemented"
)
-@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, raw_request):
async for response in self._generate(raw_request, RequestType.CHAT):
yield response
-@dynemo_endpoint(CompletionRequest, CompletionStreamResponse)
+@dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completions(self, raw_request):
async for response in self._generate(raw_request, RequestType.COMPLETION):
yield response
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Set up clients to the router and workers.
-Serve the dynemo.process.chat/completions endpoint.
+Serve the dynamo.process.chat/completions endpoint.
"""
workers_client = (
-await runtime.namespace("dynemo")
+await runtime.namespace("dynamo")
.component("vllm")
.endpoint("generate")
.client()
)
router_client = (
-await runtime.namespace("dynemo")
+await runtime.namespace("dynamo")
.component("router")
.endpoint("generate")
.client()
)
-preprocess_component = runtime.namespace("dynemo").component("process")
+preprocess_component = runtime.namespace("dynamo").component("process")
await preprocess_component.create_service()
chat_endpoint = preprocess_component.endpoint("chat/completions")
......
@@ -23,8 +23,8 @@ import uvloop
from common.protocol import Tokens
from vllm.logger import logger as vllm_logger
-from dynemo.llm import KvIndexer, KvMetricsAggregator, KvRouter
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.llm import KvIndexer, KvMetricsAggregator, KvRouter
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
WorkerId = str
@@ -51,7 +51,7 @@ class Router:
self.router = router
self.routing_strategy = routing_strategy
-@dynemo_endpoint(Tokens, WorkerId)
+@dynamo_endpoint(Tokens, WorkerId)
async def generate(self, request) -> AsyncIterator[WorkerId]:
lora_id = 0
worker_id = None
@@ -108,7 +108,7 @@ class CustomRouter:
)
return current_best[0]
-@dynemo_endpoint(Tokens, WorkerId)
+@dynamo_endpoint(Tokens, WorkerId)
async def generate(self, request) -> AsyncIterator[WorkerId]:
lora_id = 0
worker_id = ""
@@ -132,14 +132,14 @@ class CustomRouter:
yield str(worker_id)
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, args: Namespace):
"""
Set up the worker clients.
-Serve the dynemo.router.generate endpoint.
+Serve the dynamo.router.generate endpoint.
"""
workers_client = (
-await runtime.namespace("dynemo")
+await runtime.namespace("dynamo")
.component("vllm")
.endpoint("generate")
.client()
@@ -164,10 +164,10 @@ async def worker(runtime: DistributedRuntime, args: Namespace):
+ "\n".join(f"id: {id}" for id in workers_client.endpoint_ids())
)
-kv_listener = runtime.namespace("dynemo").component("vllm")
+kv_listener = runtime.namespace("dynamo").component("vllm")
await kv_listener.create_service()
-router_component = runtime.namespace("dynemo").component("router")
+router_component = runtime.namespace("dynamo").component("router")
await router_component.create_service()
endpoint = router_component.endpoint("generate")
......
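The router hunks show the namespace rename but elide how the KV components get wired to the endpoint. A sketch of the likely continuation, assuming `KvIndexer` and `KvMetricsAggregator` are constructed from the `kv_listener` component and that the endpoint is served via `serve_endpoint` (every constructor signature here is an assumption, not taken from this diff):

```python
# Hypothetical continuation of the router worker above.
indexer = KvIndexer(kv_listener)                       # follows KV events from dynamo/vllm (args assumed)
metrics_aggregator = KvMetricsAggregator(kv_listener)  # aggregates per-worker load (args assumed)
router = CustomRouter(indexer, metrics_aggregator)     # constructor assumed
await endpoint.serve_endpoint(router.generate)
```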
@@ -25,8 +25,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
from vllm.sampling_params import RequestOutputKind
-from dynemo.llm import KvMetricsPublisher
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.llm import KvMetricsPublisher
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
vllm_logger.info(f"VLLM_KV_CAPI_PATH: {os.environ['VLLM_KV_CAPI_PATH']}")
@@ -48,7 +48,7 @@ class VllmEngine(BaseVllmEngine):
assert self.engine_client is not None, "engine_client was not initialized"
self.engine_client.set_metrics_publisher(self.metrics_publisher)
-@dynemo_endpoint(vLLMGenerateRequest, MyRequestOutput)
+@dynamo_endpoint(vLLMGenerateRequest, MyRequestOutput)
async def generate(self, request) -> AsyncIterator:
assert (
self.engine_client is not None
@@ -73,12 +73,12 @@ class VllmEngine(BaseVllmEngine):
).model_dump_json()
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
-Serve the dynemo.vllm.generate endpoint.
+Serve the dynamo.vllm.generate endpoint.
"""
-worker_component = runtime.namespace("dynemo").component("vllm")
+worker_component = runtime.namespace("dynamo").component("vllm")
await worker_component.create_service()
worker_endpoint = worker_component.endpoint("generate")
@@ -87,7 +87,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
-VLLM_KV_NAMESPACE = "dynemo"
+VLLM_KV_NAMESPACE = "dynamo"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
VLLM_KV_COMPONENT = "vllm"
......
@@ -28,7 +28,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.logger import logger as vllm_logger
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmEngine(BaseVllmEngine, ProcessMixIn):
@@ -39,7 +39,7 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn):
def __init__(self, engine_args: AsyncEngineArgs):
super().__init__(engine_args)
-@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, raw_request):
if self.engine_client is None:
await self.initialize()
@@ -71,13 +71,13 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn):
yield response
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
-component = runtime.namespace("dynemo").component("vllm")
+component = runtime.namespace("dynamo").component("vllm")
await component.create_service()
endpoint = component.endpoint("generate")
......
@@ -19,27 +19,27 @@ import asyncio
import uvloop
from preprocessor.common import parse_vllm_args
-from dynemo.runtime import (
+from dynamo.runtime import (
DistributedRuntime,
ModelDeploymentCard,
OAIChatPreprocessor,
-dynemo_worker,
+dynamo_worker,
)
uvloop.install()
-@dynemo_worker()
+@dynamo_worker()
async def preprocessor(runtime: DistributedRuntime, model_name: str, model_path: str):
# create model deployment card
mdc = await ModelDeploymentCard.from_local_path(model_path, model_name)
# create preprocessor endpoint
-component = runtime.namespace("dynemo").component("preprocessor")
+component = runtime.namespace("dynamo").component("preprocessor")
await component.create_service()
endpoint = component.endpoint("generate")
# create backend endpoint
-backend = runtime.namespace("dynemo").component("backend").endpoint("generate")
+backend = runtime.namespace("dynamo").component("backend").endpoint("generate")
# start preprocessor service with next backend
chat = OAIChatPreprocessor(mdc, endpoint, next=backend)
......
@@ -28,12 +28,12 @@ from vllm.entrypoints.openai.api_server import (
)
from vllm.outputs import CompletionOutput
-from dynemo.runtime import (
+from dynamo.runtime import (
Backend,
DistributedRuntime,
ModelDeploymentCard,
-dynemo_endpoint,
-dynemo_worker,
+dynamo_endpoint,
+dynamo_worker,
)
finish_reason_map = {
@@ -107,7 +107,7 @@ class VllmEngine(AsyncContextDecorator):
}
return SamplingParams(**sampling_params)
-@dynemo_endpoint(Any, CompletionOutput)
+@dynamo_endpoint(Any, CompletionOutput)
async def generate(self, request):
state = DeltaState()
request_id = str(uuid.uuid4())
@@ -122,13 +122,13 @@ class VllmEngine(AsyncContextDecorator):
yield self.to_backend_output(choice, delta_token_ids)
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: NvAsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
-component = runtime.namespace("dynemo").component("backend")
+component = runtime.namespace("dynamo").component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
......
@@ -21,12 +21,12 @@
if [ $# -gt 2 ]; then
echo "Usage: $0 [model_name] [endpoint_name]"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
-echo "Optional: endpoint_name (default: dynemo.vllm.generate)"
+echo "Optional: endpoint_name (default: dynamo.vllm.generate)"
exit 1
fi
MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-ENDPOINT_NAME=${2:-"dynemo.vllm.generate"}
+ENDPOINT_NAME=${2:-"dynamo.vllm.generate"}
SESSION_NAME="vllm_disagg"
WORKDIR="$(dirname $0)/.."
INIT_CMD="cd $WORKDIR"
......
@@ -22,14 +22,14 @@ if [ $# -lt 2 ]; then
echo "Usage: $0 <number_of_workers> <routing_strategy> [model_name] [endpoint_name]"
echo "Error: Must specify at least number of workers and routing strategy"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
-echo "Optional: endpoint_name (default: dynemo.process.chat/completions)"
+echo "Optional: endpoint_name (default: dynamo.process.chat/completions)"
exit 1
fi
NUM_WORKERS=$1
ROUTING_STRATEGY=$2
MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-ENDPOINT_NAME=${4:-"dynemo.process.chat/completions"}
+ENDPOINT_NAME=${4:-"dynamo.process.chat/completions"}
VALID_STRATEGIES=("prefix")
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
......
@@ -26,12 +26,12 @@ Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
-│ dynemo/distributed-runtime
+│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
-│ dynemo/distributed-runtime
+│ dynamo/runtime
┌─────────────┐
│ Backend │
@@ -45,7 +45,7 @@ Users/Clients (HTTP)
```bash
cd /workspace/examples/python_rs/llm/vllm
-dynemo-sdk serve sdk_basic_service.basic:Frontend
+dynamo-sdk serve sdk_basic_service.basic:Frontend
```
2. Send request to frontend using curl -
@@ -58,4 +58,4 @@ curl -X 'POST' \
-d '{
"text": "test"
}'
-```
\ No newline at end of file
+```
@@ -15,7 +15,7 @@
from pydantic import BaseModel
-from dynemo.sdk import api, depends, dynemo_endpoint, service
+from dynamo.sdk import api, depends, dynamo_endpoint, service
"""
Pipeline Architecture:
@@ -26,12 +26,12 @@ Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
-│ dynemo/distributed-runtime
+│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
-│ dynemo/distributed-runtime
+│ dynamo/runtime
┌─────────────┐
│ Backend │
@@ -50,7 +50,7 @@ class ResponseType(BaseModel):
@service(
resources={"cpu": "2"},
traffic={"timeout": 30},
-dynemo={
+dynamo={
"enabled": True,
"namespace": "inference",
},
@@ -60,9 +60,10 @@ class Backend:
def __init__(self) -> None:
print("Starting backend")
-@dynemo_endpoint()
+@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Generate tokens."""
+print("here2")
req_text = req.text
print(f"Backend received: {req_text}")
text = f"{req_text}-back"
@@ -73,7 +74,7 @@
@service(
resources={"cpu": "2"},
traffic={"timeout": 30},
-dynemo={"enabled": True, "namespace": "inference"},
+dynamo={"enabled": True, "namespace": "inference"},
)
class Middle:
backend = depends(Backend)
@@ -81,13 +82,14 @@ class Middle:
def __init__(self) -> None:
print("Starting middle")
-@dynemo_endpoint()
+@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Forward requests to backend."""
req_text = req.text
print(f"Middle received: {req_text}")
text = f"{req_text}-mid"
next_request = RequestType(text=text).model_dump_json()
print("here5")
async for response in self.backend.generate(next_request):
print(f"Middle received response: {response}")
yield f"Middle: {response}"
......
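The hunks above define the `Backend` and `Middle` stages of the pipeline; the `Frontend` stage is elided from this diff. A sketch of what a matching frontend could look like, reusing the same `dynamo.sdk` decorators imported earlier; the `@api` usage and its signature are assumptions, not shown here:

```python
# Hypothetical Frontend completing the Frontend -> Middle -> Backend pipeline
# from the ASCII diagram above. @api is imported in the hunk above, but its
# exact contract is assumed.
@service(
    resources={"cpu": "2"},
    traffic={"timeout": 30},
    dynamo={"enabled": True, "namespace": "inference"},
)
class Frontend:
    middle = depends(Middle)

    @api
    async def generate(self, text: str):
        """HTTP /generate entrypoint; forwards to Middle and streams back."""
        next_request = RequestType(text=text).model_dump_json()
        async for response in self.middle.generate(next_request):
            yield f"Frontend: {response}"
```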
@@ -33,7 +33,7 @@ All of the commands below are run inside the same container.
## Run deployment
-Add model to dynemo and start http server.
+Add model to dynamo and start http server.
In terminal 0:
```
@@ -65,7 +65,7 @@ CUDA_VISIBLE_DEVICES=0 python prefill_worker.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enforce-eager \
--kv-transfer-config \
-'{"kv_connector":"DynemoNixlConnector"}'
+'{"kv_connector":"DynamoNixlConnector"}'
```
In terminal 2:
@@ -78,7 +78,7 @@ CUDA_VISIBLE_DEVICES=1,2 python3 worker.py \
--enforce-eager \
--tensor-parallel-size 2 \
--kv-transfer-config \
-'{"kv_connector":"DynemoNixlConnector"}'
+'{"kv_connector":"DynamoNixlConnector"}'
```
@@ -157,7 +157,7 @@ rm -r /tmp/nixl
- [x] Zero copy
- [x] Conditional remote prefill
- [x] Manual example with tp > 1
-- [x] Run on dynemo distributed runtime
+- [x] Run on dynamo distributed runtime
- [x] add oai http endpoint
- [x] Sample only on decode, do note return remote prefill response
- [x] Check if all transfers finished before moving to decode
......
@@ -22,7 +22,7 @@ from vllm.distributed.device_communicators.nixl import NixlMetadata
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
-from dynemo.runtime import DistributedRuntime
+from dynamo.runtime import DistributedRuntime
METADATA_DIR = "/tmp/nixl"
......
@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs.data import TokensPrompt
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
-from dynemo.runtime import DistributedRuntime, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_worker
class RequestHandler:
@@ -71,7 +71,7 @@ class RequestHandler:
yield
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
component = runtime.namespace("test-nixl").component("prefill")
await component.create_service()
......
@@ -33,7 +33,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
-from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker
+from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class RequestHandler:
@@ -83,7 +83,7 @@ class RequestHandler:
return callback
-@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, request):
if not self.initialized:
await self.init()
@@ -109,7 +109,7 @@ class RequestHandler:
yield response
-@dynemo_worker()
+@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
component = runtime.namespace("test-nixl").component("vllm")
await component.create_service()
......
@@ -956,7 +956,7 @@ dependencies = [
]
[[package]]
-name = "dynemo-llm"
+name = "dynamo-llm"
version = "0.2.1"
dependencies = [
"anyhow",
@@ -971,7 +971,7 @@ dependencies = [
"chrono",
"cmake",
"derive_builder",
-"dynemo-runtime",
+"dynamo-runtime",
"either",
"erased-serde",
"futures",
@@ -1005,7 +1005,7 @@ dependencies = [
]
[[package]]
-name = "dynemo-runtime"
+name = "dynamo-runtime"
version = "0.2.1"
dependencies = [
"anyhow",
@@ -1463,7 +1463,7 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
name = "hello_world"
version = "0.2.0"
dependencies = [
-"dynemo-runtime",
+"dynamo-runtime",
]
[[package]]
@@ -1488,8 +1488,8 @@ name = "http"
version = "0.2.0"
dependencies = [
"clap",
-"dynemo-llm",
-"dynemo-runtime",
+"dynamo-llm",
+"dynamo-runtime",
"serde",
"serde_json",
"tokio",
@@ -1988,8 +1988,8 @@ name = "llmctl"
version = "0.2.0"
dependencies = [
"clap",
-"dynemo-llm",
-"dynemo-runtime",
+"dynamo-llm",
+"dynamo-runtime",
"serde",
"serde_json",
"tabled",
@@ -3390,7 +3390,7 @@ dependencies = [
name = "service_metrics"
version = "0.2.0"
dependencies = [
-"dynemo-runtime",
+"dynamo-runtime",
"futures",
"serde",
"serde_json",
......