Commit 602352ce authored by Neelay Shah, committed by GitHub
Browse files

chore: rename dynamo (#44)


Co-authored-by: Biswa Panda <biswa.panda@gmail.com>
parent ecf53ce2
...@@ -32,7 +32,7 @@ from tensorrt_llm.serve.openai_protocol import ( ...@@ -32,7 +32,7 @@ from tensorrt_llm.serve.openai_protocol import (
CompletionStreamResponse, CompletionStreamResponse,
) )
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
logger.set_level("debug") logger.set_level("debug")
...@@ -45,7 +45,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine): ...@@ -45,7 +45,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
def __init__(self, engine_config: LLMAPIConfig): def __init__(self, engine_config: LLMAPIConfig):
super().__init__(engine_config) super().__init__(engine_config)
@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse) @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, request): async def generate_chat(self, request):
if self._llm_engine is None: if self._llm_engine is None:
raise RuntimeError("Engine not initialized") raise RuntimeError("Engine not initialized")
...@@ -93,7 +93,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine): ...@@ -93,7 +93,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
except Exception as e: except Exception as e:
raise RuntimeError("Failed to generate: " + str(e)) raise RuntimeError("Failed to generate: " + str(e))
@dynemo_endpoint(CompletionRequest, CompletionStreamResponse) @dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completion(self, request): async def generate_completion(self, request):
if self._llm_engine is None: if self._llm_engine is None:
raise RuntimeError("Engine not initialized") raise RuntimeError("Engine not initialized")
...@@ -140,13 +140,13 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine): ...@@ -140,13 +140,13 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
raise RuntimeError("Failed to generate: " + str(e)) raise RuntimeError("Failed to generate: " + str(e))
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_config: LLMAPIConfig): async def worker(runtime: DistributedRuntime, engine_config: LLMAPIConfig):
""" """
Instantiate a `backend` component and serve the `generate` endpoint Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints A `Component` can serve multiple endpoints
""" """
component = runtime.namespace("dynemo").component("tensorrt-llm") component = runtime.namespace("dynamo").component("tensorrt-llm")
await component.create_service() await component.create_service()
completions_endpoint = component.endpoint("completions") completions_endpoint = component.endpoint("completions")
......
...@@ -63,7 +63,7 @@ By default the server will run on port 8080. ...@@ -63,7 +63,7 @@ By default the server will run on port 8080.
Add model to the server: Add model to the server:
```bash ```bash
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.vllm.generate llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.generate
``` ```
##### Example Output ##### Example Output
...@@ -71,7 +71,7 @@ llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.vllm ...@@ -71,7 +71,7 @@ llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.vllm
+------------+------------------------------------------+-----------+-----------+----------+ +------------+------------------------------------------+-----------+-----------+----------+
| MODEL TYPE | MODEL NAME | NAMESPACE | COMPONENT | ENDPOINT | | MODEL TYPE | MODEL NAME | NAMESPACE | COMPONENT | ENDPOINT |
+------------+------------------------------------------+-----------+-----------+----------+ +------------+------------------------------------------+-----------+-----------+----------+
| chat | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | dynemo | vllm | generate | | chat | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | dynamo | vllm | generate |
+------------+------------------------------------------+-----------+-----------+----------+ +------------+------------------------------------------+-----------+-----------+----------+
``` ```
...@@ -83,7 +83,7 @@ In a separate terminal run the vllm worker: ...@@ -83,7 +83,7 @@ In a separate terminal run the vllm worker:
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
# Launch worker # Launch worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
...@@ -116,7 +116,7 @@ This deployment option splits the model serving across prefill and decode worker ...@@ -116,7 +116,7 @@ This deployment option splits the model serving across prefill and decode worker
**Terminal 1 - Prefill Worker:** **Terminal 1 - Prefill Worker:**
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
# Launch prefill worker # Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
...@@ -126,7 +126,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat ...@@ -126,7 +126,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 1 \ --tensor-parallel-size 1 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"DynemoNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' '{"kv_connector":"DynamoNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
``` ```
##### Example Output ##### Example Output
...@@ -142,7 +142,7 @@ INFO 03-02 05:59:47 llm_engine.py:476] init engine (profile, create kv cache, wa ...@@ -142,7 +142,7 @@ INFO 03-02 05:59:47 llm_engine.py:476] init engine (profile, create kv cache, wa
**Terminal 2 - Decode Worker:** **Terminal 2 - Decode Worker:**
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
# Launch decode worker # Launch decode worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
...@@ -152,7 +152,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg ...@@ -152,7 +152,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 2 \ --tensor-parallel-size 2 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"DynemoNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' '{"kv_connector":"DynamoNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
``` ```
The disaggregated deployment utilizes separate GPUs for prefill and decode operations, allowing for optimized resource allocation and improved performance. For more details on the disaggregated deployment, please refer to the [vLLM documentation](https://docs.vllm.ai/en/latest/features/disagg_prefill.html). The disaggregated deployment utilizes separate GPUs for prefill and decode operations, allowing for optimized resource allocation and improved performance. For more details on the disaggregated deployment, please refer to the [vLLM documentation](https://docs.vllm.ai/en/latest/features/disagg_prefill.html).
...@@ -254,7 +254,7 @@ tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {} ...@@ -254,7 +254,7 @@ tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {}
**Terminal 1 - Router:** **Terminal 1 - Router:**
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
# Launch router # Launch router
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
...@@ -270,7 +270,7 @@ You can choose only the prefix strategy for now: ...@@ -270,7 +270,7 @@ You can choose only the prefix strategy for now:
**Terminal 2 - Processor:** **Terminal 2 - Processor:**
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
# Processor must take the same args as the worker # Processor must take the same args as the worker
# This is temporary until we communicate the ModelDeploymentCard over etcd # This is temporary until we communicate the ModelDeploymentCard over etcd
...@@ -286,7 +286,7 @@ RUST_LOG=info python3 -m kv_router.processor \ ...@@ -286,7 +286,7 @@ RUST_LOG=info python3 -m kv_router.processor \
**Terminal 3 and 4 - Workers:** **Terminal 3 and 4 - Workers:**
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
# Launch Worker 1 and Worker 2 with same command # Launch Worker 1 and Worker 2 with same command
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
...@@ -304,7 +304,7 @@ Note: block-size must be 64, otherwise Router won't work (accepts only 64 tokens ...@@ -304,7 +304,7 @@ Note: block-size must be 64, otherwise Router won't work (accepts only 64 tokens
**Terminal 5 - Client:** **Terminal 5 - Client:**
Don't forget to add the model to the server: Don't forget to add the model to the server:
```bash ```bash
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.process.chat/completions llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.process.chat/completions
``` ```
```bash ```bash
...@@ -351,7 +351,7 @@ Run following commands in 4 terminals: ...@@ -351,7 +351,7 @@ Run following commands in 4 terminals:
**Terminal 1 - vLLM Worker:** **Terminal 1 - vLLM Worker:**
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
...@@ -361,7 +361,7 @@ RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Dis ...@@ -361,7 +361,7 @@ RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Dis
```bash ```bash
# Activate virtual environment # Activate virtual environment
source /opt/dynemo/venv/bin/activate source /opt/dynamo/venv/bin/activate
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m preprocessor.processor --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B RUST_LOG=info python3 -m preprocessor.processor --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
...@@ -377,7 +377,7 @@ By default the server will run on port 8080. ...@@ -377,7 +377,7 @@ By default the server will run on port 8080.
Add model to the server: Add model to the server:
```bash ```bash
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B dynemo.preprocessor.generate llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B dynamo.preprocessor.generate
``` ```
**Terminal 4 - client** **Terminal 4 - client**
......
...@@ -19,12 +19,12 @@ import asyncio ...@@ -19,12 +19,12 @@ import asyncio
import uvloop import uvloop
from dynemo.runtime import DistributedRuntime, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
from .protocol import Request from .protocol import Request
@dynemo_worker() @dynamo_worker()
async def worker( async def worker(
runtime: DistributedRuntime, runtime: DistributedRuntime,
component: str, component: str,
...@@ -36,7 +36,7 @@ async def worker( ...@@ -36,7 +36,7 @@ async def worker(
Instantiate a `backend` client and call the `generate` endpoint Instantiate a `backend` client and call the `generate` endpoint
""" """
# get endpoint # get endpoint
endpoint = runtime.namespace("dynemo").component(component).endpoint("generate") endpoint = runtime.namespace("dynamo").component(component).endpoint("generate")
# create client # create client
client = await endpoint.client() client = await endpoint.client()
......
...@@ -31,7 +31,7 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -31,7 +31,7 @@ from vllm.entrypoints.openai.protocol import (
) )
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn): class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
...@@ -54,7 +54,7 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn): ...@@ -54,7 +54,7 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank self.kv_rank = self.kv_transfer_config.kv_rank
@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse) @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, raw_request): async def generate(self, raw_request):
if self.engine_client is None: if self.engine_client is None:
await self.initialize() await self.initialize()
...@@ -103,17 +103,17 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn): ...@@ -103,17 +103,17 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
await prefill_output await prefill_output
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
""" """
Instantiate a `backend` component and serve the `generate` endpoint Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints A `Component` can serve multiple endpoints
""" """
component = runtime.namespace("dynemo").component("vllm") component = runtime.namespace("dynamo").component("vllm")
await component.create_service() await component.create_service()
prefill = ( prefill = (
await runtime.namespace("dynemo") await runtime.namespace("dynamo")
.component("prefill") .component("prefill")
.endpoint("generate") .endpoint("generate")
.client() .client()
......
...@@ -24,7 +24,7 @@ from common.protocol import PrefillRequest, PrefillResponse ...@@ -24,7 +24,7 @@ from common.protocol import PrefillRequest, PrefillResponse
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmPrefillEngine(BaseVllmEngine): class VllmPrefillEngine(BaseVllmEngine):
...@@ -45,7 +45,7 @@ class VllmPrefillEngine(BaseVllmEngine): ...@@ -45,7 +45,7 @@ class VllmPrefillEngine(BaseVllmEngine):
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank self.kv_rank = self.kv_transfer_config.kv_rank
@dynemo_endpoint(PrefillRequest, PrefillResponse) @dynamo_endpoint(PrefillRequest, PrefillResponse)
async def generate(self, request): async def generate(self, request):
if self.engine_client is None: if self.engine_client is None:
await self.initialize() await self.initialize()
...@@ -62,13 +62,13 @@ class VllmPrefillEngine(BaseVllmEngine): ...@@ -62,13 +62,13 @@ class VllmPrefillEngine(BaseVllmEngine):
yield True yield True
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
""" """
Instantiate a `backend` component and serve the `generate` endpoint Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints A `Component` can serve multiple endpoints
""" """
component = runtime.namespace("dynemo").component("prefill") component = runtime.namespace("dynamo").component("prefill")
await component.create_service() await component.create_service()
async with VllmPrefillEngine(engine_args) as prefill_engine: async with VllmPrefillEngine(engine_args) as prefill_engine:
......
...@@ -34,7 +34,7 @@ from vllm.logger import logger as vllm_logger ...@@ -34,7 +34,7 @@ from vllm.logger import logger as vllm_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from dynemo.runtime import Client, DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import Client, DistributedRuntime, dynamo_endpoint, dynamo_worker
class RequestType(Enum): class RequestType(Enum):
...@@ -157,38 +157,38 @@ class Processor(ProcessMixIn): ...@@ -157,38 +157,38 @@ class Processor(ProcessMixIn):
f"Request type {request_type} not implemented" f"Request type {request_type} not implemented"
) )
@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse) @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, raw_request): async def generate_chat(self, raw_request):
async for response in self._generate(raw_request, RequestType.CHAT): async for response in self._generate(raw_request, RequestType.CHAT):
yield response yield response
@dynemo_endpoint(CompletionRequest, CompletionStreamResponse) @dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completions(self, raw_request): async def generate_completions(self, raw_request):
async for response in self._generate(raw_request, RequestType.COMPLETION): async for response in self._generate(raw_request, RequestType.COMPLETION):
yield response yield response
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
""" """
Set up clients to the router and workers. Set up clients to the router and workers.
Serve the dynemo.process.chat/completions endpoint. Serve the dynamo.process.chat/completions endpoint.
""" """
workers_client = ( workers_client = (
await runtime.namespace("dynemo") await runtime.namespace("dynamo")
.component("vllm") .component("vllm")
.endpoint("generate") .endpoint("generate")
.client() .client()
) )
router_client = ( router_client = (
await runtime.namespace("dynemo") await runtime.namespace("dynamo")
.component("router") .component("router")
.endpoint("generate") .endpoint("generate")
.client() .client()
) )
preprocess_component = runtime.namespace("dynemo").component("process") preprocess_component = runtime.namespace("dynamo").component("process")
await preprocess_component.create_service() await preprocess_component.create_service()
chat_endpoint = preprocess_component.endpoint("chat/completions") chat_endpoint = preprocess_component.endpoint("chat/completions")
......
...@@ -23,8 +23,8 @@ import uvloop ...@@ -23,8 +23,8 @@ import uvloop
from common.protocol import Tokens from common.protocol import Tokens
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from dynemo.llm import KvIndexer, KvMetricsAggregator, KvRouter from dynamo.llm import KvIndexer, KvMetricsAggregator, KvRouter
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
WorkerId = str WorkerId = str
...@@ -51,7 +51,7 @@ class Router: ...@@ -51,7 +51,7 @@ class Router:
self.router = router self.router = router
self.routing_strategy = routing_strategy self.routing_strategy = routing_strategy
@dynemo_endpoint(Tokens, WorkerId) @dynamo_endpoint(Tokens, WorkerId)
async def generate(self, request) -> AsyncIterator[WorkerId]: async def generate(self, request) -> AsyncIterator[WorkerId]:
lora_id = 0 lora_id = 0
worker_id = None worker_id = None
...@@ -108,7 +108,7 @@ class CustomRouter: ...@@ -108,7 +108,7 @@ class CustomRouter:
) )
return current_best[0] return current_best[0]
@dynemo_endpoint(Tokens, WorkerId) @dynamo_endpoint(Tokens, WorkerId)
async def generate(self, request) -> AsyncIterator[WorkerId]: async def generate(self, request) -> AsyncIterator[WorkerId]:
lora_id = 0 lora_id = 0
worker_id = "" worker_id = ""
...@@ -132,14 +132,14 @@ class CustomRouter: ...@@ -132,14 +132,14 @@ class CustomRouter:
yield str(worker_id) yield str(worker_id)
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, args: Namespace): async def worker(runtime: DistributedRuntime, args: Namespace):
""" """
Set up the worker clients. Set up the worker clients.
Serve the dynemo.router.generate endpoint. Serve the dynamo.router.generate endpoint.
""" """
workers_client = ( workers_client = (
await runtime.namespace("dynemo") await runtime.namespace("dynamo")
.component("vllm") .component("vllm")
.endpoint("generate") .endpoint("generate")
.client() .client()
...@@ -164,10 +164,10 @@ async def worker(runtime: DistributedRuntime, args: Namespace): ...@@ -164,10 +164,10 @@ async def worker(runtime: DistributedRuntime, args: Namespace):
+ "\n".join(f"id: {id}" for id in workers_client.endpoint_ids()) + "\n".join(f"id: {id}" for id in workers_client.endpoint_ids())
) )
kv_listener = runtime.namespace("dynemo").component("vllm") kv_listener = runtime.namespace("dynamo").component("vllm")
await kv_listener.create_service() await kv_listener.create_service()
router_component = runtime.namespace("dynemo").component("router") router_component = runtime.namespace("dynamo").component("router")
await router_component.create_service() await router_component.create_service()
endpoint = router_component.endpoint("generate") endpoint = router_component.endpoint("generate")
......
...@@ -25,8 +25,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -25,8 +25,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from dynemo.llm import KvMetricsPublisher from dynamo.llm import KvMetricsPublisher
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
vllm_logger.info(f"VLLM_KV_CAPI_PATH: {os.environ['VLLM_KV_CAPI_PATH']}") vllm_logger.info(f"VLLM_KV_CAPI_PATH: {os.environ['VLLM_KV_CAPI_PATH']}")
...@@ -48,7 +48,7 @@ class VllmEngine(BaseVllmEngine): ...@@ -48,7 +48,7 @@ class VllmEngine(BaseVllmEngine):
assert self.engine_client is not None, "engine_client was not initialized" assert self.engine_client is not None, "engine_client was not initialized"
self.engine_client.set_metrics_publisher(self.metrics_publisher) self.engine_client.set_metrics_publisher(self.metrics_publisher)
@dynemo_endpoint(vLLMGenerateRequest, MyRequestOutput) @dynamo_endpoint(vLLMGenerateRequest, MyRequestOutput)
async def generate(self, request) -> AsyncIterator: async def generate(self, request) -> AsyncIterator:
assert ( assert (
self.engine_client is not None self.engine_client is not None
...@@ -73,12 +73,12 @@ class VllmEngine(BaseVllmEngine): ...@@ -73,12 +73,12 @@ class VllmEngine(BaseVllmEngine):
).model_dump_json() ).model_dump_json()
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
""" """
Serve the dynemo.vllm.generate endpoint. Serve the dynamo.vllm.generate endpoint.
""" """
worker_component = runtime.namespace("dynemo").component("vllm") worker_component = runtime.namespace("dynamo").component("vllm")
await worker_component.create_service() await worker_component.create_service()
worker_endpoint = worker_component.endpoint("generate") worker_endpoint = worker_component.endpoint("generate")
...@@ -87,7 +87,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): ...@@ -87,7 +87,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID) os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}") vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
VLLM_KV_NAMESPACE = "dynemo" VLLM_KV_NAMESPACE = "dynamo"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE) os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
VLLM_KV_COMPONENT = "vllm" VLLM_KV_COMPONENT = "vllm"
......
...@@ -28,7 +28,7 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -28,7 +28,7 @@ from vllm.entrypoints.openai.protocol import (
) )
from vllm.logger import logger as vllm_logger from vllm.logger import logger as vllm_logger
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmEngine(BaseVllmEngine, ProcessMixIn): class VllmEngine(BaseVllmEngine, ProcessMixIn):
...@@ -39,7 +39,7 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn): ...@@ -39,7 +39,7 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn):
def __init__(self, engine_args: AsyncEngineArgs): def __init__(self, engine_args: AsyncEngineArgs):
super().__init__(engine_args) super().__init__(engine_args)
@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse) @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, raw_request): async def generate(self, raw_request):
if self.engine_client is None: if self.engine_client is None:
await self.initialize() await self.initialize()
...@@ -71,13 +71,13 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn): ...@@ -71,13 +71,13 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn):
yield response yield response
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
""" """
Instantiate a `backend` component and serve the `generate` endpoint Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints A `Component` can serve multiple endpoints
""" """
component = runtime.namespace("dynemo").component("vllm") component = runtime.namespace("dynamo").component("vllm")
await component.create_service() await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
......
...@@ -19,27 +19,27 @@ import asyncio ...@@ -19,27 +19,27 @@ import asyncio
import uvloop import uvloop
from preprocessor.common import parse_vllm_args from preprocessor.common import parse_vllm_args
from dynemo.runtime import ( from dynamo.runtime import (
DistributedRuntime, DistributedRuntime,
ModelDeploymentCard, ModelDeploymentCard,
OAIChatPreprocessor, OAIChatPreprocessor,
dynemo_worker, dynamo_worker,
) )
uvloop.install() uvloop.install()
@dynemo_worker() @dynamo_worker()
async def preprocessor(runtime: DistributedRuntime, model_name: str, model_path: str): async def preprocessor(runtime: DistributedRuntime, model_name: str, model_path: str):
# create model deployment card # create model deployment card
mdc = await ModelDeploymentCard.from_local_path(model_path, model_name) mdc = await ModelDeploymentCard.from_local_path(model_path, model_name)
# create preprocessor endpoint # create preprocessor endpoint
component = runtime.namespace("dynemo").component("preprocessor") component = runtime.namespace("dynamo").component("preprocessor")
await component.create_service() await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
# create backend endpoint # create backend endpoint
backend = runtime.namespace("dynemo").component("backend").endpoint("generate") backend = runtime.namespace("dynamo").component("backend").endpoint("generate")
# start preprocessor service with next backend # start preprocessor service with next backend
chat = OAIChatPreprocessor(mdc, endpoint, next=backend) chat = OAIChatPreprocessor(mdc, endpoint, next=backend)
......
...@@ -28,12 +28,12 @@ from vllm.entrypoints.openai.api_server import ( ...@@ -28,12 +28,12 @@ from vllm.entrypoints.openai.api_server import (
) )
from vllm.outputs import CompletionOutput from vllm.outputs import CompletionOutput
from dynemo.runtime import ( from dynamo.runtime import (
Backend, Backend,
DistributedRuntime, DistributedRuntime,
ModelDeploymentCard, ModelDeploymentCard,
dynemo_endpoint, dynamo_endpoint,
dynemo_worker, dynamo_worker,
) )
finish_reason_map = { finish_reason_map = {
...@@ -107,7 +107,7 @@ class VllmEngine(AsyncContextDecorator): ...@@ -107,7 +107,7 @@ class VllmEngine(AsyncContextDecorator):
} }
return SamplingParams(**sampling_params) return SamplingParams(**sampling_params)
@dynemo_endpoint(Any, CompletionOutput) @dynamo_endpoint(Any, CompletionOutput)
async def generate(self, request): async def generate(self, request):
state = DeltaState() state = DeltaState()
request_id = str(uuid.uuid4()) request_id = str(uuid.uuid4())
...@@ -122,13 +122,13 @@ class VllmEngine(AsyncContextDecorator): ...@@ -122,13 +122,13 @@ class VllmEngine(AsyncContextDecorator):
yield self.to_backend_output(choice, delta_token_ids) yield self.to_backend_output(choice, delta_token_ids)
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: NvAsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: NvAsyncEngineArgs):
""" """
Instantiate a `backend` component and serve the `generate` endpoint Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints A `Component` can serve multiple endpoints
""" """
component = runtime.namespace("dynemo").component("backend") component = runtime.namespace("dynamo").component("backend")
await component.create_service() await component.create_service()
endpoint = component.endpoint("generate") endpoint = component.endpoint("generate")
......
...@@ -21,12 +21,12 @@ ...@@ -21,12 +21,12 @@
if [ $# -gt 2 ]; then if [ $# -gt 2 ]; then
echo "Usage: $0 [model_name] [endpoint_name]" echo "Usage: $0 [model_name] [endpoint_name]"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)" echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: endpoint_name (default: dynemo.vllm.generate)" echo "Optional: endpoint_name (default: dynamo.vllm.generate)"
exit 1 exit 1
fi fi
MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
ENDPOINT_NAME=${2:-"dynemo.vllm.generate"} ENDPOINT_NAME=${2:-"dynamo.vllm.generate"}
SESSION_NAME="vllm_disagg" SESSION_NAME="vllm_disagg"
WORKDIR="$(dirname $0)/.." WORKDIR="$(dirname $0)/.."
INIT_CMD="cd $WORKDIR" INIT_CMD="cd $WORKDIR"
......
...@@ -22,14 +22,14 @@ if [ $# -lt 2 ]; then ...@@ -22,14 +22,14 @@ if [ $# -lt 2 ]; then
echo "Usage: $0 <number_of_workers> <routing_strategy> [model_name] [endpoint_name]" echo "Usage: $0 <number_of_workers> <routing_strategy> [model_name] [endpoint_name]"
echo "Error: Must specify at least number of workers and routing strategy" echo "Error: Must specify at least number of workers and routing strategy"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)" echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: endpoint_name (default: dynemo.process.chat/completions)" echo "Optional: endpoint_name (default: dynamo.process.chat/completions)"
exit 1 exit 1
fi fi
NUM_WORKERS=$1 NUM_WORKERS=$1
ROUTING_STRATEGY=$2 ROUTING_STRATEGY=$2
MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
ENDPOINT_NAME=${4:-"dynemo.process.chat/completions"} ENDPOINT_NAME=${4:-"dynamo.process.chat/completions"}
VALID_STRATEGIES=("prefix") VALID_STRATEGIES=("prefix")
SESSION_NAME="v" SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm" WORKDIR="/workspace/examples/python_rs/llm/vllm"
......
...@@ -26,12 +26,12 @@ Users/Clients (HTTP) ...@@ -26,12 +26,12 @@ Users/Clients (HTTP)
┌─────────────┐ ┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate) │ Frontend │ HTTP API endpoint (/generate)
└─────────────┘ └─────────────┘
│ dynemo/distributed-runtime │ dynamo/runtime
┌─────────────┐ ┌─────────────┐
│ Middle │ │ Middle │
└─────────────┘ └─────────────┘
│ dynemo/distributed-runtime │ dynamo/runtime
┌─────────────┐ ┌─────────────┐
│ Backend │ │ Backend │
...@@ -45,7 +45,7 @@ Users/Clients (HTTP) ...@@ -45,7 +45,7 @@ Users/Clients (HTTP)
```bash ```bash
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
dynemo-sdk serve sdk_basic_service.basic:Frontend dynamo-sdk serve sdk_basic_service.basic:Frontend
``` ```
2. Send request to frontend using curl - 2. Send request to frontend using curl -
...@@ -58,4 +58,4 @@ curl -X 'POST' \ ...@@ -58,4 +58,4 @@ curl -X 'POST' \
-d '{ -d '{
"text": "test" "text": "test"
}' }'
``` ```
\ No newline at end of file
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from pydantic import BaseModel from pydantic import BaseModel
from dynemo.sdk import api, depends, dynemo_endpoint, service from dynamo.sdk import api, depends, dynamo_endpoint, service
""" """
Pipeline Architecture: Pipeline Architecture:
...@@ -26,12 +26,12 @@ Users/Clients (HTTP) ...@@ -26,12 +26,12 @@ Users/Clients (HTTP)
┌─────────────┐ ┌─────────────┐
│ Frontend │ HTTP API endpoint (/generate) │ Frontend │ HTTP API endpoint (/generate)
└─────────────┘ └─────────────┘
│ dynemo/distributed-runtime │ dynamo/runtime
┌─────────────┐ ┌─────────────┐
│ Middle │ │ Middle │
└─────────────┘ └─────────────┘
│ dynemo/distributed-runtime │ dynamo/runtime
┌─────────────┐ ┌─────────────┐
│ Backend │ │ Backend │
...@@ -50,7 +50,7 @@ class ResponseType(BaseModel): ...@@ -50,7 +50,7 @@ class ResponseType(BaseModel):
@service( @service(
resources={"cpu": "2"}, resources={"cpu": "2"},
traffic={"timeout": 30}, traffic={"timeout": 30},
dynemo={ dynamo={
"enabled": True, "enabled": True,
"namespace": "inference", "namespace": "inference",
}, },
...@@ -60,9 +60,10 @@ class Backend: ...@@ -60,9 +60,10 @@ class Backend:
def __init__(self) -> None: def __init__(self) -> None:
print("Starting backend") print("Starting backend")
@dynemo_endpoint() @dynamo_endpoint()
async def generate(self, req: RequestType): async def generate(self, req: RequestType):
"""Generate tokens.""" """Generate tokens."""
print("here2")
req_text = req.text req_text = req.text
print(f"Backend received: {req_text}") print(f"Backend received: {req_text}")
text = f"{req_text}-back" text = f"{req_text}-back"
...@@ -73,7 +74,7 @@ class Backend: ...@@ -73,7 +74,7 @@ class Backend:
@service( @service(
resources={"cpu": "2"}, resources={"cpu": "2"},
traffic={"timeout": 30}, traffic={"timeout": 30},
dynemo={"enabled": True, "namespace": "inference"}, dynamo={"enabled": True, "namespace": "inference"},
) )
class Middle: class Middle:
backend = depends(Backend) backend = depends(Backend)
...@@ -81,13 +82,14 @@ class Middle: ...@@ -81,13 +82,14 @@ class Middle:
def __init__(self) -> None: def __init__(self) -> None:
print("Starting middle") print("Starting middle")
@dynemo_endpoint() @dynamo_endpoint()
async def generate(self, req: RequestType): async def generate(self, req: RequestType):
"""Forward requests to backend.""" """Forward requests to backend."""
req_text = req.text req_text = req.text
print(f"Middle received: {req_text}") print(f"Middle received: {req_text}")
text = f"{req_text}-mid" text = f"{req_text}-mid"
next_request = RequestType(text=text).model_dump_json() next_request = RequestType(text=text).model_dump_json()
print("here5")
async for response in self.backend.generate(next_request): async for response in self.backend.generate(next_request):
print(f"Middle received response: {response}") print(f"Middle received response: {response}")
yield f"Middle: {response}" yield f"Middle: {response}"
......
...@@ -33,7 +33,7 @@ All of the commands below are run inside the same container. ...@@ -33,7 +33,7 @@ All of the commands below are run inside the same container.
## Run deployment ## Run deployment
Add model to dynemo and start http server. Add model to dynamo and start http server.
In terminal 0: In terminal 0:
``` ```
...@@ -65,7 +65,7 @@ CUDA_VISIBLE_DEVICES=0 python prefill_worker.py \ ...@@ -65,7 +65,7 @@ CUDA_VISIBLE_DEVICES=0 python prefill_worker.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enforce-eager \ --enforce-eager \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"DynemoNixlConnector"}' '{"kv_connector":"DynamoNixlConnector"}'
``` ```
In terminal 2: In terminal 2:
...@@ -78,7 +78,7 @@ CUDA_VISIBLE_DEVICES=1,2 python3 worker.py \ ...@@ -78,7 +78,7 @@ CUDA_VISIBLE_DEVICES=1,2 python3 worker.py \
--enforce-eager \ --enforce-eager \
--tensor-parallel-size 2 \ --tensor-parallel-size 2 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"DynemoNixlConnector"}' '{"kv_connector":"DynamoNixlConnector"}'
``` ```
...@@ -157,7 +157,7 @@ rm -r /tmp/nixl ...@@ -157,7 +157,7 @@ rm -r /tmp/nixl
- [x] Zero copy - [x] Zero copy
- [x] Conditional remote prefill - [x] Conditional remote prefill
- [x] Manual example with tp > 1 - [x] Manual example with tp > 1
- [x] Run on dynemo distributed runtime - [x] Run on dynamo distributed runtime
- [x] add oai http endpoint - [x] add oai http endpoint
- [x] Sample only on decode, do not return remote prefill response - [x] Sample only on decode, do not return remote prefill response
- [x] Check if all transfers finished before moving to decode - [x] Check if all transfers finished before moving to decode
......
...@@ -22,7 +22,7 @@ from vllm.distributed.device_communicators.nixl import NixlMetadata ...@@ -22,7 +22,7 @@ from vllm.distributed.device_communicators.nixl import NixlMetadata
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from dynemo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
METADATA_DIR = "/tmp/nixl" METADATA_DIR = "/tmp/nixl"
......
...@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.api_server import ( ...@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from dynemo.runtime import DistributedRuntime, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
class RequestHandler: class RequestHandler:
...@@ -71,7 +71,7 @@ class RequestHandler: ...@@ -71,7 +71,7 @@ class RequestHandler:
yield yield
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
component = runtime.namespace("test-nixl").component("prefill") component = runtime.namespace("test-nixl").component("prefill")
await component.create_service() await component.create_service()
......
...@@ -33,7 +33,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat ...@@ -33,7 +33,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class RequestHandler: class RequestHandler:
...@@ -83,7 +83,7 @@ class RequestHandler: ...@@ -83,7 +83,7 @@ class RequestHandler:
return callback return callback
@dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse) @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, request): async def generate(self, request):
if not self.initialized: if not self.initialized:
await self.init() await self.init()
...@@ -109,7 +109,7 @@ class RequestHandler: ...@@ -109,7 +109,7 @@ class RequestHandler:
yield response yield response
@dynemo_worker() @dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
component = runtime.namespace("test-nixl").component("vllm") component = runtime.namespace("test-nixl").component("vllm")
await component.create_service() await component.create_service()
......
...@@ -956,7 +956,7 @@ dependencies = [ ...@@ -956,7 +956,7 @@ dependencies = [
] ]
[[package]] [[package]]
name = "dynemo-llm" name = "dynamo-llm"
version = "0.2.1" version = "0.2.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
...@@ -971,7 +971,7 @@ dependencies = [ ...@@ -971,7 +971,7 @@ dependencies = [
"chrono", "chrono",
"cmake", "cmake",
"derive_builder", "derive_builder",
"dynemo-runtime", "dynamo-runtime",
"either", "either",
"erased-serde", "erased-serde",
"futures", "futures",
...@@ -1005,7 +1005,7 @@ dependencies = [ ...@@ -1005,7 +1005,7 @@ dependencies = [
] ]
[[package]] [[package]]
name = "dynemo-runtime" name = "dynamo-runtime"
version = "0.2.1" version = "0.2.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
...@@ -1463,7 +1463,7 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" ...@@ -1463,7 +1463,7 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
name = "hello_world" name = "hello_world"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"dynemo-runtime", "dynamo-runtime",
] ]
[[package]] [[package]]
...@@ -1488,8 +1488,8 @@ name = "http" ...@@ -1488,8 +1488,8 @@ name = "http"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"clap", "clap",
"dynemo-llm", "dynamo-llm",
"dynemo-runtime", "dynamo-runtime",
"serde", "serde",
"serde_json", "serde_json",
"tokio", "tokio",
...@@ -1988,8 +1988,8 @@ name = "llmctl" ...@@ -1988,8 +1988,8 @@ name = "llmctl"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"clap", "clap",
"dynemo-llm", "dynamo-llm",
"dynemo-runtime", "dynamo-runtime",
"serde", "serde",
"serde_json", "serde_json",
"tabled", "tabled",
...@@ -3390,7 +3390,7 @@ dependencies = [ ...@@ -3390,7 +3390,7 @@ dependencies = [
name = "service_metrics" name = "service_metrics"
version = "0.2.0" version = "0.2.0"
dependencies = [ dependencies = [
"dynemo-runtime", "dynamo-runtime",
"futures", "futures",
"serde", "serde",
"serde_json", "serde_json",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment