Commit 5bcdb734 authored by Neelay Shah, committed by GitHub

refactor: rename vllm_nixl to vllm and make default (#100)

parent a7c35dcf
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import os
from typing import AsyncIterator
import uvloop
from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args
from common.protocol import MyRequestOutput, vLLMGenerateRequest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
from vllm.sampling_params import RequestOutputKind
from dynamo.llm import KvMetricsPublisher
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
vllm_logger.info(f"VLLM_KV_CAPI_PATH: {os.environ['VLLM_KV_CAPI_PATH']}")
class VllmEngine(BaseVllmEngine):
"""
vLLM Inference Engine
"""
def __init__(
self, engine_args: AsyncEngineArgs, metrics_publisher: KvMetricsPublisher
):
self.metrics_publisher = metrics_publisher
self.engine_args = engine_args
super().__init__(engine_args)
async def initialize(self):
await super().initialize()
assert self.engine_client is not None, "engine_client was not initialized"
self.engine_client.set_metrics_publisher(self.metrics_publisher)
@dynamo_endpoint(vLLMGenerateRequest, MyRequestOutput)
async def generate(self, request) -> AsyncIterator:
assert (
self.engine_client is not None
), "engine_client was not initialized, must call initialize() first"
sampling_params = request.sampling_params
# rust HTTP requires Delta streaming
sampling_params.output_kind = RequestOutputKind.DELTA
async for response in self.engine_client.generate(
request.engine_prompt, sampling_params, request.request_id
):
# MyRequestOutput takes care of serializing the response as
# vLLM's RequestOutput is not serializable by default
yield MyRequestOutput(
request_id=response.request_id,
prompt=response.prompt,
prompt_token_ids=response.prompt_token_ids,
prompt_logprobs=response.prompt_logprobs,
outputs=response.outputs,
finished=response.finished,
).model_dump_json()
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Serve the dynamo.vllm.generate endpoint.
"""
worker_component = runtime.namespace("dynamo").component("vllm")
await worker_component.create_service()
worker_endpoint = worker_component.endpoint("generate")
VLLM_WORKER_ID = worker_endpoint.lease_id()
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
VLLM_KV_NAMESPACE = "dynamo"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
VLLM_KV_COMPONENT = "vllm"
os.environ["VLLM_KV_COMPONENT"] = str(VLLM_KV_COMPONENT)
metrics_publisher = KvMetricsPublisher()
vllm_engine = VllmEngine(engine_args, metrics_publisher)
await vllm_engine.initialize()
# Initially send dummy metrics to kick-start the publisher;
# vLLM will not update the stats until a forward pass is triggered
metrics_publisher.publish(
0, # request_active_slots
1024, # request_total_slots
0, # kv_active_blocks
1024, # kv_total_blocks
0, # num_requests_waiting
0.0, # gpu_cache_usage_perc
0.0, # gpu_prefix_cache_hit_rate
)
await asyncio.gather(
worker_endpoint.serve_endpoint(vllm_engine.generate),
metrics_publisher.create_endpoint(worker_component),
)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import json
from typing import AsyncGenerator
import uvloop
from common.parser import parse_vllm_args
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionStreamResponse,
CompletionRequest,
CompletionResponse,
CompletionStreamResponse,
ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmEngine:
def __init__(
self, engine_client: EngineClient, model_config: ModelConfig
):
self.engine_client = engine_client
self.model_config = model_config
# Ensure served_model_name matches the OpenAI model name.
# Use --served-model-name to set this explicitly; otherwise it falls back to --model.
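# Hypothetical invocation (module path and paths are illustrative only): serve a
# local checkpoint under its canonical OpenAI name so clients can request it by that name:
#   python3 -m monolith.worker --model /data/DeepSeek-R1-Distill-Llama-8B \
#       --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B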
models = OpenAIServingModels(
engine_client=engine_client,
model_config=model_config,
base_model_paths=[
BaseModelPath(
name=model_config.served_model_name,
model_path=model_config.model,
)
],
)
self.chat_serving = OpenAIServingChat(
engine_client=self.engine_client,
model_config=self.model_config,
models=models,
response_role="assistant",
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
self.completion_serving = OpenAIServingCompletion(
engine_client=self.engine_client,
model_config=self.model_config,
models=models,
request_logger=None,
)
@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, request):
result = await self.chat_serving.create_chat_completion(request)
if isinstance(result, AsyncGenerator):
async for raw_response in result:
if raw_response.startswith("data: [DONE]"):
break
response = json.loads(raw_response.lstrip("data: "))
yield response
# We should always be streaming so should never get here
elif isinstance(result, ChatCompletionResponse):
raise RuntimeError("ChatCompletionResponse support not implemented")
elif isinstance(result, ErrorResponse):
error = result.dict()
raise RuntimeError(
f"Error {error['code']}: {error['message']} "
f"(type: {error['type']}, param: {error['param']})"
)
else:
raise TypeError(f"Unexpected response type: {type(result)}")
@dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completions(self, request):
result = await self.completion_serving.create_completion(request)
if isinstance(result, AsyncGenerator):
async for raw_response in result:
if raw_response.startswith("data: [DONE]"):
break
response = json.loads(raw_response.lstrip("data: "))
yield response
# We should always be streaming so should never get here
elif isinstance(result, CompletionResponse):
raise RuntimeError("CompletionResponse support not implemented")
elif isinstance(result, ErrorResponse):
error = result.dict()
raise RuntimeError(
f"Error {error['code']}: {error['message']} "
f"(type: {error['type']}, param: {error['param']})"
)
else:
raise TypeError(f"Unexpected response type: {type(result)}")
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `vllm` component and serve the `chat/completions` and `completions` endpoints.
A `Component` can serve multiple endpoints.
"""
component = runtime.namespace("dynamo").component("vllm")
await component.create_service()
chat_endpoint = component.endpoint("chat/completions")
completions_endpoint = component.endpoint("completions")
async with build_async_engine_client_from_engine_args(engine_args) as engine_client:
model_config = await engine_client.get_model_config()
engine = VllmEngine(engine_client, model_config)
await asyncio.gather(
chat_endpoint.serve_endpoint(engine.generate_chat),
completions_endpoint.serve_endpoint(engine.generate_completions),
)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field
from huggingface_hub import snapshot_download
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
@dataclass
class NvAsyncEngineArgs(AsyncEngineArgs):
model_path: str = field(default="")
def parse_vllm_args() -> NvAsyncEngineArgs:
parser = FlexibleArgumentParser()
parser = AsyncEngineArgs.add_cli_args(parser)
parser.add_argument(
"--model-path",
type=str,
default="",
)
args = parser.parse_args()
if args.model_path == "":
if os.environ.get("HF_TOKEN"):
args.model_path = snapshot_download(args.model)
else:
raise ValueError(
"Please set HF_TOKEN environment variable "
"or pass --model-path to load the model"
)
return NvAsyncEngineArgs.from_cli_args(args)
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
PORT=8080
# list models
echo "\n\n### Listing models"
curl http://localhost:$PORT/v1/models
# create completion
echo "\n\n### Creating completions"
curl -X 'POST' \
"http://localhost:$PORT/v1/chat/completions" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"messages": [
{
"role":"user",
"content":"what is deep learning?"
}
],
"max_tokens": 64,
"stream": true,
"temperature": 0.7,
"top_p": 0.9,
"frequency_penalty": 0.1,
"presence_penalty": 0.2,
"top_k": 5
}'
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uvloop
from preprocessor.common import parse_vllm_args
from dynamo.runtime import (
DistributedRuntime,
ModelDeploymentCard,
OAIChatPreprocessor,
dynamo_worker,
)
uvloop.install()
@dynamo_worker()
async def preprocessor(runtime: DistributedRuntime, model_name: str, model_path: str):
# create model deployment card
mdc = await ModelDeploymentCard.from_local_path(model_path, model_name)
# create preprocessor endpoint
component = runtime.namespace("dynamo").component("preprocessor")
await component.create_service()
endpoint = component.endpoint("generate")
# create backend endpoint
backend = runtime.namespace("dynamo").component("backend").endpoint("generate")
# start preprocessor service with next backend
chat = OAIChatPreprocessor(mdc, endpoint, next=backend)
await chat.start()
if __name__ == "__main__":
args = parse_vllm_args()
asyncio.run(preprocessor(args.model, args.model_path))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import inspect
import uuid
from contextlib import AsyncContextDecorator
from typing import Any
import uvloop
from preprocessor.common import NvAsyncEngineArgs, parse_vllm_args
from vllm import SamplingParams
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.outputs import CompletionOutput
from dynamo.runtime import (
Backend,
DistributedRuntime,
ModelDeploymentCard,
dynamo_endpoint,
dynamo_worker,
)
finish_reason_map = {
None: None,
"stop": "stop",
"abort": "cancelled",
"length": "length",
"error": "error",
}
class DeltaState:
"""
The vLLM AsyncEngine returns the full internal state of each slot per forward pass.
The OpenAI ChatCompletionResponseDelta object only requires the delta, so this object
is used to track the state of the last forward pass to calculate the delta.
"""
def __init__(self):
self.token_ids = None
self.last_token_count = 0
def delta(self, choice):
self.token_ids = choice.token_ids
tokens_produced = len(choice.token_ids) - self.last_token_count
self.last_token_count = len(choice.token_ids)
return choice.token_ids[-tokens_produced:]
class VllmEngine(AsyncContextDecorator):
"""
Request handler for the generate endpoint
"""
def __init__(self, engine_args: NvAsyncEngineArgs, mdc: ModelDeploymentCard):
self.mdc = mdc
self.engine_args = engine_args
print("vllm backend started")
async def __aenter__(self):
await self.async_init()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
# Close the underlying engine client context before exiting
await self._engine_context.__aexit__(exc_type, exc_value, traceback)
print("vllm backend exited")
async def async_init(self):
self._engine_context = build_async_engine_client_from_engine_args(
self.engine_args, False
)
if self._engine_context is not None:
self.engine_client = await self._engine_context.__aenter__()
else:
raise RuntimeError("Failed to initialize engine client")
def to_backend_output(self, response: CompletionOutput, delta_token_ids: list[int]):
return {
"token_ids": delta_token_ids,
"tokens": [],
"finish_reason": finish_reason_map.get(response.finish_reason, "stop"),
"cum_log_probs": response.cumulative_logprob,
"text": None,
}
def to_sampling_params(self, request) -> SamplingParams:
sampling_params_names = inspect.signature(SamplingParams).parameters.keys()
sampling_params = {
k: v
for k, v in request.get("sampling_options", {}).items()
if k in sampling_params_names and v is not None
}
return SamplingParams(**sampling_params)
@dynamo_endpoint(Any, CompletionOutput)
async def generate(self, request):
state = DeltaState()
request_id = str(uuid.uuid4())
sampling_params = self.to_sampling_params(request)
inputs = {"prompt_token_ids": request["token_ids"]}
stream = self.engine_client.generate(
inputs, sampling_params, request_id=request_id
)
async for request_output in stream:
for choice in request_output.outputs:
delta_token_ids = state.delta(choice)
yield self.to_backend_output(choice, delta_token_ids)
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: NvAsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace("dynamo").component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
mdc = await ModelDeploymentCard.from_local_path(
engine_args.model_path, engine_args.model
)
async with VllmEngine(engine_args, mdc) as engine:
backend = Backend(mdc, endpoint)
await backend.start(engine.generate)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node
set -xe
if [ $# -lt 3 ]; then
echo "Usage: $0 <number_of_workers> <routing_strategy> <log_dir_name> [model_name] [model_args] [chat_endpoint_name] [completions_endpoint_name]"
echo "Error: Must specify at least number_of_workers, routing_strategy, and log_dir_name"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: model_args (quoted string with model arguments)"
echo "Optional: chat_endpoint_name (default: dynamo.vllm.chat/completions)"
echo "Optional: completions_endpoint_name (default: dynamo.vllm.completions)"
exit 1
fi
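# Example invocation (hypothetical values; script name is a placeholder):
# launch 2 workers with random routing, logging under /logs/run1
#   bash <this_script>.sh 2 random run1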
# If the model is already cached locally, this avoids checking the HF Hub
export HF_HUB_OFFLINE=1
export GLOO_SOCKET_IFNAME=lo
# Required for Qwen2.5 R1 Distilled in order to set --block-size 64 and --kv-cache-dtype fp8
uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.5/
export VLLM_ATTENTION_BACKEND=FLASHINFER
NUM_WORKERS=$1
ROUTING_STRATEGY=$2
LOG_DIR_NAME=$3
MODEL_NAME=${4:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
CUSTOM_MODEL_ARGS=$5
CHAT_ENDPOINT_NAME=${6:-"dynamo.vllm.chat/completions"}
COMPLETIONS_ENDPOINT_NAME=${7:-"dynamo.vllm.completions"}
VALID_STRATEGIES=("random")
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="source /opt/dynamo/venv/bin/activate && cd $WORKDIR"
# Default model args
DEFAULT_MODEL_ARGS="--model $MODEL_NAME \
--tokenizer $MODEL_NAME \
--enable-prefix-caching \
--block-size 64"
# Use custom model args if provided, otherwise use default
if [ -n "$CUSTOM_MODEL_ARGS" ]; then
MODEL_ARGS="$CUSTOM_MODEL_ARGS"
echo "Using custom model arguments"
else
MODEL_ARGS="$DEFAULT_MODEL_ARGS"
echo "Using default model arguments"
fi
# Create logs directory if it doesn't exist
LOGS_DIR="/logs/$LOG_DIR_NAME"
mkdir -p $LOGS_DIR
chmod -R 775 $LOGS_DIR
if [[ ! " ${VALID_STRATEGIES[@]} " =~ " ${ROUTING_STRATEGY} " ]]; then
echo "Error: Invalid routing strategy. Must be one of: ${VALID_STRATEGIES[*]}"
exit 1
fi
########################################################
# HTTP Server
########################################################
HTTP_CMD="DYN_LOG=DEBUG http |& tee $LOGS_DIR/http.log"
tmux new-session -d -s "$SESSION_NAME-http"
tmux send-keys -t "$SESSION_NAME-http" "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && \
llmctl http remove chat $MODEL_NAME && \
llmctl http remove completions $MODEL_NAME && \
llmctl http add chat $MODEL_NAME $CHAT_ENDPOINT_NAME && \
llmctl http add completions $MODEL_NAME $COMPLETIONS_ENDPOINT_NAME && \
llmctl http list |& tee $LOGS_DIR/llmctl.log"
tmux new-session -d -s "$SESSION_NAME-llmctl"
tmux send-keys -t "$SESSION_NAME-llmctl" "$INIT_CMD && $LLMCTL_CMD" C-m
########################################################
# Workers
########################################################
WORKER_CMD="RUST_LOG=info python3 -m monolith.worker $MODEL_ARGS"
for i in $(seq 1 $NUM_WORKERS); do
tmux new-session -d -s "$SESSION_NAME-$i"
done
for i in $(seq 1 $NUM_WORKERS); do
tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD |& tee $LOGS_DIR/worker-$i.log" C-m
done
#!/bin/bash -e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must have at least 2 GPUs since CUDA_VISIBLE_DEVICES is hard-coded to 0 and 1
# - Must use a single node
if [ $# -gt 2 ]; then
echo "Usage: $0 [model_name] [endpoint_name]"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: endpoint_name (default: dynamo.vllm.generate)"
exit 1
fi
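# Example invocation (hypothetical; script name is a placeholder, passing the defaults explicitly):
#   bash <this_script>.sh deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.generate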
MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
ENDPOINT_NAME=${2:-"dynamo.vllm.generate"}
SESSION_NAME="vllm_disagg"
WORKDIR="$(dirname $0)/.."
INIT_CMD="cd $WORKDIR"
########################################################
# TMUX SESSION SETUP
########################################################
# Start new session
tmux new-session -d -s "$SESSION_NAME"
# Split into 4 equal panes
tmux split-window -h
tmux split-window -v
tmux select-pane -t 0
tmux split-window -v
########################################################
# HTTP Server
########################################################
HTTP_HOST="localhost"
HTTP_PORT=8080
HTTP_CMD="DYN_LOG=DEBUG http --host ${HTTP_HOST} --port ${HTTP_PORT}"
tmux select-pane -t 0
tmux send-keys "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && llmctl http remove chat-model $MODEL_NAME && \
llmctl http add chat-model $MODEL_NAME $ENDPOINT_NAME && \
llmctl http list chat-model"
tmux select-pane -t 1
tmux send-keys "$INIT_CMD && $LLMCTL_CMD" C-m
CURL_CMD="curl ${HTTP_HOST}:${HTTP_PORT}/v1/chat/completions \
-H \"Content-Type: application/json\" \
-d '{
\"model\": \"$MODEL_NAME\",
\"messages\": [
{\"role\": \"user\", \"content\": \"What is the capital of France?\"}
],
\"stream\": true,
\"max_tokens\": 10
}'"
# Prepare a curl command for a quick test, but don't execute it since the server
# needs to spin up first.
tmux send-keys "$CURL_CMD"
########################################################
# Processor
########################################################
# skip
########################################################
# Router
########################################################
# skip
########################################################
# Prefill
########################################################
PREFILL_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
python3 -m disaggregated.prefill_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"DynamoNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_rank\":0,\"kv_parallel_size\":2}'"
tmux select-pane -t 2
tmux send-keys "$INIT_CMD && $PREFILL_CMD" C-m
########################################################
# Decode
########################################################
DECODE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1 \
python3 -m disaggregated.decode_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"DynamoNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_rank\":1,\"kv_parallel_size\":2}'"
tmux select-pane -t 3
tmux send-keys "$INIT_CMD && $DECODE_CMD" C-m
tmux attach-session -t "$SESSION_NAME"
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node
if [ $# -lt 2 ]; then
echo "Usage: $0 <number_of_workers> <log_dir_name> [model_name] [model_args] [chat_endpoint_name] [completions_endpoint_name]"
echo "Error: Must specify at least number_of_workers and log_dir_name"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: model_args (quoted string with model arguments)"
echo "Optional: chat_endpoint_name (default: dynamo.process.chat/completions)"
echo "Optional: completions_endpoint_name (default: dynamo.process.completions)"
exit 1
fi
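# Example invocation (hypothetical values; script name is a placeholder):
# launch 2 workers, logging under /logs/kv_run
#   bash <this_script>.sh 2 kv_run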
# Uncomment if the model is already cached locally
# export HF_HUB_OFFLINE=1
# https://github.com/vllm-project/vllm/issues/10734#issuecomment-2507201353
# Fix for: torch.distributed.DistBackendError: File name too long
# export GLOO_SOCKET_IFNAME=lo
NUM_WORKERS=$1
LOG_DIR_NAME=$2
MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
CUSTOM_MODEL_ARGS=$4
CHAT_ENDPOINT_NAME=${5:-"dynamo.process.chat/completions"}
COMPLETIONS_ENDPOINT_NAME=${6:-"dynamo.process.completions"}
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="cd $WORKDIR"
# Default model args
DEFAULT_MODEL_ARGS="--model $MODEL_NAME \
--tokenizer $MODEL_NAME \
--enable-prefix-caching \
--block-size 64"
# Use custom model args if provided, otherwise use default
if [ -n "$CUSTOM_MODEL_ARGS" ]; then
MODEL_ARGS="$CUSTOM_MODEL_ARGS"
echo "Using custom model arguments"
else
MODEL_ARGS="$DEFAULT_MODEL_ARGS"
echo "Using default model arguments"
fi
# Create logs directory if it doesn't exist
LOGS_DIR="/logs/$LOG_DIR_NAME"
mkdir -p $LOGS_DIR
chmod -R 775 $LOGS_DIR
########################################################
# HTTP Server
########################################################
HTTP_CMD="DYN_LOG=DEBUG http |& tee $LOGS_DIR/http.log"
tmux new-session -d -s "$SESSION_NAME-http"
tmux send-keys -t "$SESSION_NAME-http" "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && \
llmctl http remove chat $MODEL_NAME && \
llmctl http remove completions $MODEL_NAME && \
llmctl http add chat $MODEL_NAME $CHAT_ENDPOINT_NAME && \
llmctl http add completions $MODEL_NAME $COMPLETIONS_ENDPOINT_NAME && \
llmctl http list |& tee $LOGS_DIR/llmctl.log"
tmux new-session -d -s "$SESSION_NAME-llmctl"
tmux send-keys -t "$SESSION_NAME-llmctl" "$INIT_CMD && $LLMCTL_CMD" C-m
########################################################
# Processor
########################################################
PROCESSOR_CMD="RUST_LOG=info python3 -m kv_router.processor $MODEL_ARGS |& tee $LOGS_DIR/processor.log"
tmux new-session -d -s "$SESSION_NAME-processor"
tmux send-keys -t "$SESSION_NAME-processor" "$INIT_CMD && $PROCESSOR_CMD" C-m
########################################################
# Router
########################################################
ROUTER_CMD="RUST_LOG=info python3 -m kv_router.router \
--min-workers $NUM_WORKERS |& tee $LOGS_DIR/router.log"
tmux new-session -d -s "$SESSION_NAME-router"
tmux send-keys -t "$SESSION_NAME-router" "$INIT_CMD && $ROUTER_CMD" C-m
########################################################
# Workers
########################################################
WORKER_CMD="RUST_LOG=info python3 -m kv_router.worker $MODEL_ARGS"
for i in $(seq 1 $NUM_WORKERS); do
tmux new-session -d -s "$SESSION_NAME-$i"
done
for i in $(seq 1 $NUM_WORKERS); do
tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD |& tee $LOGS_DIR/worker-$i.log" C-m
done
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
## Overview
Pipeline Architecture:
```
Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Backend │
└─────────────┘
```
## Unified serve
1. Launch all three services with a single command:
```bash
cd /workspace/examples/python_rs/llm/vllm
dynamo-sdk serve sdk_basic_service.basic:Frontend
```
2. Send a request to the frontend using curl:
```bash
curl -X 'POST' \
'http://localhost:3000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"text": "test"
}'
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pydantic import BaseModel
from dynamo.sdk import api, depends, dynamo_endpoint, service
"""
Pipeline Architecture:
Users/Clients (HTTP)
┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate)
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Middle │
└─────────────┘
│ dynamo/runtime
┌─────────────┐
│ Backend │
└─────────────┘
"""
class RequestType(BaseModel):
text: str
class ResponseType(BaseModel):
text: str
@service(
resources={"cpu": "2"},
traffic={"timeout": 30},
dynamo={
"enabled": True,
"namespace": "inference",
},
workers=3,
)
class Backend:
def __init__(self) -> None:
print("Starting backend")
@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Generate tokens."""
req_text = req.text
print(f"Backend received: {req_text}")
text = f"{req_text}-back"
for token in text.split():
yield f"Backend: {token}"
@service(
resources={"cpu": "2"},
traffic={"timeout": 30},
dynamo={"enabled": True, "namespace": "inference"},
)
class Middle:
backend = depends(Backend)
def __init__(self) -> None:
print("Starting middle")
@dynamo_endpoint()
async def generate(self, req: RequestType):
"""Forward requests to backend."""
req_text = req.text
print(f"Middle received: {req_text}")
text = f"{req_text}-mid"
next_request = RequestType(text=text).model_dump_json()
async for response in self.backend.generate(next_request):
print(f"Middle received response: {response}")
yield f"Middle: {response}"
@service(resources={"cpu": "1"}, traffic={"timeout": 60}) # Regular HTTP API
class Frontend:
middle = depends(Middle)
def __init__(self) -> None:
print("Starting frontend")
@api
async def generate(self, text):
"""Stream results from the pipeline."""
print(f"Frontend received: {text}")
print(f"Frontend received type: {type(text)}")
txt = RequestType(text=text)
print(f"Frontend sending: {type(txt)}")
async for response in self.middle.generate(txt.model_dump_json()):
yield f"Frontend: {response}"
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
Run this example using the command below:
```bash
cd /workspace/examples/python_rs/llm/vllm
dynamo-sdk serve sdk_kv_router.frontend:Frontend
```
Send a request to the HTTP service:
```bash
curl -X 'POST' \
'http://localhost:3000/chat_completion' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"msg": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}'
```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sdk_kv_router.processor import Processor
from dynamo.sdk import DYNAMO_IMAGE, api, depends, service
@service(traffic={"timeout": 10000}, image=DYNAMO_IMAGE)
class Frontend:
processor = depends(Processor)
def __init__(self):
print("frontend init")
@api
async def chat_completion(self, msg: str):
# Call the generate method
generator = self.processor.generate(
{
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"messages": [{"role": "user", "content": msg}],
"stream": True,
"max_tokens": 10,
}
)
# Now iterate over the async generator
async for response in generator:
print("client response_data:", response)
yield response
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import uuid
from typing import AsyncIterator
import bentoml
from sdk_kv_router.router import Router
from sdk_kv_router.worker import VllmEngine
with bentoml.importing():
from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
from common.chat_processor import ChatProcessor, ProcessMixIn
from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest
from dynamo.sdk import depends, dynamo_context, dynamo_endpoint, service
@service(
dynamo={
"enabled": True,
"namespace": "dynamo",
},
resources={"cpu": "10", "memory": "20Gi"},
workers=1,
)
class Processor(ProcessMixIn):
"""
vLLM pre and post processing
"""
workers = depends(VllmEngine)
router = depends(Router)
def __init__(self):
model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
self.engine_args = AsyncEngineArgs(
model=model,
tokenizer=model,
enable_prefix_caching=True,
block_size=64,
max_model_len=16384,
)
self.model_config = self.engine_args.create_model_config()
self.tokenizer = self._create_tokenizer()
self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
def _create_tokenizer(self) -> AnyTokenizer:
"""Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
model_path = self.engine_args.model
# Create the base tokenizer with VLLM's typical settings
base_tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
padding_side="left",
truncation_side="left",
use_fast=True, # VLLM might use the fast tokenizer for efficiency
)
return base_tokenizer
async def generate_responses(
self, engine_generator
) -> AsyncIterator[RequestOutput]:
async for resp in engine_generator:
# Deserialize the response from the engine
# Creates correct vLLM objects for each field
output = MyRequestOutput.model_validate_json(resp.data())
yield RequestOutput(
request_id=output.request_id,
prompt=output.prompt,
prompt_token_ids=output.prompt_token_ids,
prompt_logprobs=output.prompt_logprobs,
outputs=output.outputs,
finished=output.finished,
metrics=output.metrics,
)
@dynamo_endpoint()
async def generate(self, raw_request: ChatCompletionRequest):
request_id = str(uuid.uuid4())
(
request,
conversation,
prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
worker_id = None
async for worker in self.router.generate(
Tokens(tokens=engine_prompt["prompt_token_ids"]).model_dump_json()
):
worker_id = worker
break
runtime = dynamo_context["runtime"]
comp_ns, comp_name = VllmEngine.dynamo_address() # type: ignore
worker_client = (
await runtime.namespace(comp_ns)
.component(comp_name)
.endpoint("generate")
.client()
)
if worker_id == "":
engine_generator = await worker_client.generate(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json()
)
else:
engine_generator = await worker_client.direct(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json(),
uuid.UUID(worker_id).int,
)
output = self.generate_responses(engine_generator)
async for response in await self._stream_response(
request, output, request_id, conversation
):
yield response