Commit 1af7433b authored by Neelay Shah, committed by GitHub

refactor: rename triton_distributed to dynemo (#22)


Co-authored-by: Graham King <grahamk@nvidia.com>
parent ee4ef06b
@@ -26,11 +26,7 @@ from common.protocol import (
 from tensorrt_llm.logger import logger
 from tensorrt_llm.serve.openai_protocol import CompletionRequest, DisaggregatedParams
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker

 logger.set_level("debug")
@@ -73,7 +69,7 @@ class Router:
     # Disagg params should be in under the choices field in the response object.
     # This is the case for completions but not for chat.
-    @triton_endpoint(CompletionRequest, DisaggCompletionStreamResponse)
+    @dynemo_endpoint(CompletionRequest, DisaggCompletionStreamResponse)
     async def generate_completion(self, request):
         # These settings are needed to satisfy request checks.
         request.skip_special_tokens = False
@@ -106,7 +102,7 @@ class Router:
         )
         yield json.loads(gen_resp_obj.model_dump_json(exclude_unset=True))

-    @triton_endpoint(DisaggChatCompletionRequest, DisaggChatCompletionStreamResponse)
+    @dynemo_endpoint(DisaggChatCompletionRequest, DisaggChatCompletionStreamResponse)
     async def generate_chat(self, request):
         # These settings are needed to satisfy request checks.
         request.skip_special_tokens = False
@@ -140,35 +136,35 @@ class Router:
         yield json.loads(gen_resp_obj.model_dump_json(exclude_unset=True))

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("router")
+    component = runtime.namespace("dynemo").component("router")
     await component.create_service()

     ctx_completion_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("tensorrt-llm-ctx")
         .endpoint("completions")
         .client()
     )
     gen_completion_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("tensorrt-llm-gen")
         .endpoint("completions")
         .client()
     )
     ctx_chat_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("tensorrt-llm-ctx")
         .endpoint("chat/completions")
         .client()
     )
     gen_chat_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("tensorrt-llm-gen")
         .endpoint("chat/completions")
         .client()
...
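The same three-step pattern recurs in every file this commit touches: decorate an async `worker` with `@dynemo_worker()`, register a component under the `dynemo` namespace, and expose `@dynemo_endpoint`-decorated async generators on its endpoints. A minimal sketch of that shape, using only names visible in this diff — how a handler is finally bound to an endpoint is cut off in these hunks, so the `serve_endpoint` call and the launch code at the bottom are assumptions:

```python
# Hedged sketch of the dynemo worker/endpoint pattern used throughout this commit.
# Grounded in the diff: dynemo.runtime imports, @dynemo_worker, @dynemo_endpoint,
# runtime.namespace(...).component(...), create_service(), endpoint(), client().
# Hypothetical: serve_endpoint(...) and how decorated workers are launched.
import asyncio

import uvloop

from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker


class Handler:
    # The real examples pass Pydantic request/response models here;
    # plain dicts are a placeholder for this sketch.
    @dynemo_endpoint(dict, dict)
    async def generate(self, request):
        # Endpoints are async generators: yield one or more response chunks.
        yield {"echo": request}


@dynemo_worker()
async def worker(runtime: DistributedRuntime):
    # Register this process as the "router" component in the "dynemo" namespace.
    component = runtime.namespace("dynemo").component("router")
    await component.create_service()

    # Obtain a client to another component's endpoint; the dotted address is
    # namespace.component.endpoint. (Unused in this sketch; a real handler
    # would call it to forward work.)
    upstream = (
        await runtime.namespace("dynemo")
        .component("tensorrt-llm-ctx")
        .endpoint("completions")
        .client()
    )

    # Hypothetical: bind the handler to this component's own endpoint.
    endpoint = component.endpoint("generate")
    await endpoint.serve_endpoint(Handler().generate)


if __name__ == "__main__":
    uvloop.install()
    # Assumption: @dynemo_worker constructs the DistributedRuntime and
    # injects it when the coroutine runs.
    asyncio.run(worker())
```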
@@ -44,11 +44,7 @@ from tensorrt_llm.llmapi.disagg_utils import (
 from tensorrt_llm.logger import logger
 from tensorrt_llm.serve.openai_protocol import CompletionRequest
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker

 logger.set_level("debug")
@@ -89,7 +85,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
             engine_config.extra_args["_mpi_session"] = self._mpi_session
         super().__init__(engine_config)

-    @triton_endpoint(DisaggChatCompletionRequest, DisaggChatCompletionStreamResponse)
+    @dynemo_endpoint(DisaggChatCompletionRequest, DisaggChatCompletionStreamResponse)
     async def generate_chat(self, request):
         if self._llm_engine is None:
             raise RuntimeError("Engine not initialized")
@@ -168,7 +164,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
             self._ongoing_request_count -= 1

-    @triton_endpoint(CompletionRequest, DisaggCompletionStreamResponse)
+    @dynemo_endpoint(CompletionRequest, DisaggCompletionStreamResponse)
     async def generate_completions(self, request):
         if self._llm_engine is None:
             raise RuntimeError("Engine not initialized")
@@ -215,7 +211,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
             self._ongoing_request_count -= 1

-@triton_worker()
+@dynemo_worker()
 async def worker(
     runtime: DistributedRuntime,
     engine_config: LLMAPIConfig,
@@ -230,9 +226,7 @@ async def worker(
     server_type = disagg_config.server_configs[instance_idx].type
     logger.info(f"Starting {server_type} server")

-    component = runtime.namespace("triton-init").component(
-        f"tensorrt-llm-{server_type}"
-    )
+    component = runtime.namespace("dynemo").component(f"tensorrt-llm-{server_type}")
     await component.create_service()

     completions_endpoint = component.endpoint("completions")
...
@@ -32,11 +32,7 @@ from tensorrt_llm.serve.openai_protocol import (
     CompletionStreamResponse,
 )
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker

 logger.set_level("debug")
@@ -49,7 +45,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
     def __init__(self, engine_config: LLMAPIConfig):
         super().__init__(engine_config)

-    @triton_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+    @dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
     async def generate_chat(self, request):
         if self._llm_engine is None:
             raise RuntimeError("Engine not initialized")
@@ -97,7 +93,7 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
         except Exception as e:
             raise RuntimeError("Failed to generate: " + str(e))

-    @triton_endpoint(CompletionRequest, CompletionStreamResponse)
+    @dynemo_endpoint(CompletionRequest, CompletionStreamResponse)
     async def generate_completion(self, request):
         if self._llm_engine is None:
             raise RuntimeError("Engine not initialized")
@@ -144,13 +140,13 @@ class TensorrtLLMEngine(BaseTensorrtLLMEngine):
             raise RuntimeError("Failed to generate: " + str(e))

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_config: LLMAPIConfig):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("tensorrt-llm")
+    component = runtime.namespace("dynemo").component("tensorrt-llm")
     await component.create_service()

     completions_endpoint = component.endpoint("completions")
...
@@ -57,13 +57,13 @@ The example is designed to run in a containerized environment using Triton Distr
 Run the server logging (with debug level logging):

 ```bash
-TRD_LOG=DEBUG http
+DYN_LOG=DEBUG http
 ```

 By default the server will run on port 8080.

 Add model to the server:
 ```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B triton-init.vllm.generate
+llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.vllm.generate
 ```

 ##### Example Output
@@ -71,7 +71,7 @@ llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B triton-init
 +------------+------------------------------------------+-----------+-----------+----------+
 | MODEL TYPE | MODEL NAME                               | NAMESPACE | COMPONENT | ENDPOINT |
 +------------+------------------------------------------+-----------+-----------+----------+
-| chat       | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | triton-init | vllm    | generate |
+| chat       | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | dynemo    | vllm      | generate |
 +------------+------------------------------------------+-----------+-----------+----------+
 ```
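Once the model is registered, the frontend should accept OpenAI-style requests on port 8080. A quick smoke test, hedged: the `/v1/chat/completions` route below is the standard OpenAI-compatible path and is assumed here, since the diff never shows the HTTP routes themselves.

```python
# Hypothetical smoke test against the HTTP frontend started above.
# Assumes an OpenAI-compatible /v1/chat/completions route on port 8080.
import json
import urllib.request

payload = {
    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32,
    "stream": False,
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))
```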
@@ -83,7 +83,7 @@ In a separate terminal run the vllm worker:
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 # Launch worker
 cd /workspace/examples/python_rs/llm/vllm
@@ -116,7 +116,7 @@ This deployment option splits the model serving across prefill and decode worker
 **Terminal 1 - Prefill Worker:**
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 # Launch prefill worker
 cd /workspace/examples/python_rs/llm/vllm
@@ -126,7 +126,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat
     --enforce-eager \
     --tensor-parallel-size 1 \
     --kv-transfer-config \
-    '{"kv_connector":"TritonNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+    '{"kv_connector":"DynemoNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
 ```

 ##### Example Output
@@ -142,7 +142,7 @@ INFO 03-02 05:59:47 llm_engine.py:476] init engine (profile, create kv cache, wa
 **Terminal 2 - Decode Worker:**
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 # Launch decode worker
 cd /workspace/examples/python_rs/llm/vllm
@@ -152,7 +152,7 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg
     --enforce-eager \
     --tensor-parallel-size 2 \
     --kv-transfer-config \
-    '{"kv_connector":"TritonNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+    '{"kv_connector":"DynemoNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
 ```

 The disaggregated deployment utilizes separate GPUs for prefill and decode operations, allowing for optimized resource allocation and improved performance. For more details on the disaggregated deployment, please refer to the [vLLM documentation](https://docs.vllm.ai/en/latest/features/disagg_prefill.html).
@@ -254,7 +254,7 @@ tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {}
 **Terminal 1 - Router:**
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 # Launch prefill worker
 cd /workspace/examples/python_rs/llm/vllm
@@ -270,7 +270,7 @@ You can choose only the prefix strategy for now:
 **Terminal 2 - Processor:**
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 # Processor must take the same args as the worker
 # This is temporary until we communicate the ModelDeploymentCard over etcd
@@ -286,7 +286,7 @@ RUST_LOG=info python3 -m kv_router.processor \
 **Terminal 3 and 4 - Workers:**
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 # Launch Worker 1 and Worker 2 with same command
 cd /workspace/examples/python_rs/llm/vllm
@@ -304,7 +304,7 @@ Note: block-size must be 64, otherwise Router won't work (accepts only 64 tokens
 **Terminal 5 - Client:**

 Don't forget to add the model to the server:
 ```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B triton-init.process.chat/completions
+llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynemo.process.chat/completions
 ```

 ```bash
@@ -351,7 +351,7 @@ Run following commands in 4 terminals:
 **Terminal 1 - vLLM Worker:**
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 cd /workspace/examples/python_rs/llm/vllm
 RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
@@ -361,7 +361,7 @@ RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Dis
 ```bash
 # Activate virtual environment
-source /opt/triton/venv/bin/activate
+source /opt/dynemo/venv/bin/activate

 cd /workspace/examples/python_rs/llm/vllm
 RUST_LOG=info python3 -m preprocessor.processor --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
@@ -371,13 +371,13 @@ RUST_LOG=info python3 -m preprocessor.processor --model deepseek-ai/DeepSeek-R1-
 Run the server logging (with debug level logging):

 ```bash
-TRD_LOG=DEBUG http
+DYN_LOG=DEBUG http
 ```

 By default the server will run on port 8080.

 Add model to the server:
 ```bash
-llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B triton-init.preprocessor.generate
+llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B dynemo.preprocessor.generate
 ```

 **Terminal 4 - client**
...
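One convention worth calling out, since every `llmctl` line above depends on it: the last argument is an endpoint address of the form `namespace.component.endpoint`, and it maps one-to-one onto the runtime calls in the Python workers. For example, `dynemo.vllm.generate` is the endpoint registered by `runtime.namespace("dynemo").component("vllm").endpoint("generate")` in the worker files below. This commit changes only the namespace segment (`triton-init` → `dynemo`); the component and endpoint names are untouched.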
@@ -19,12 +19,12 @@ import asyncio

 import uvloop

-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker

 from .protocol import Request


-@triton_worker()
+@dynemo_worker()
 async def worker(
     runtime: DistributedRuntime,
     component: str,
@@ -36,9 +36,7 @@ async def worker(
     Instantiate a `backend` client and call the `generate` endpoint
     """
     # get endpoint
-    endpoint = (
-        runtime.namespace("triton-init").component(component).endpoint("generate")
-    )
+    endpoint = runtime.namespace("dynemo").component(component).endpoint("generate")

     # create client
     client = await endpoint.client()
...
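The client hunk ends right after `endpoint.client()`, so the request/response loop is not visible here. A sketch of what the calling side plausibly looks like — the `client.generate(...)` call and the `Request(prompt=...)` field are hypothetical; only the `.client()` handle and the `.data()` unwrapping pattern appear elsewhere in this commit:

```python
# Hypothetical continuation of the client above. Grounded in the diff:
# endpoint.client() and the `resp.data() if hasattr(resp, "data")` unwrapping
# seen in the router files. Assumed: the generate(...) streaming call itself.
from common.protocol import Request  # the example's own protocol module


async def call_generate(client):
    request = Request(prompt="Hello, world!")  # field name is an assumption
    async for resp in await client.generate(request.model_dump_json()):
        # Mirrors the unwrapping pattern used by the routers in this commit.
        resp = resp.data() if hasattr(resp, "data") else resp
        print(resp)
```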
@@ -31,11 +31,7 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.logger import logger as vllm_logger

-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker


 class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
@@ -58,7 +54,7 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
         self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
         self.kv_rank = self.kv_transfer_config.kv_rank

-    @triton_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+    @dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
     async def generate(self, raw_request):
         if self.engine_client is None:
             await self.initialize()
@@ -107,17 +103,17 @@ class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
             await prefill_output

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("vllm")
+    component = runtime.namespace("dynemo").component("vllm")
     await component.create_service()

     prefill = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("prefill")
         .endpoint("generate")
         .client()
...
@@ -24,11 +24,7 @@ from common.protocol import PrefillRequest, PrefillResponse
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.logger import logger as vllm_logger

-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker


 class VllmPrefillEngine(BaseVllmEngine):
@@ -49,7 +45,7 @@ class VllmPrefillEngine(BaseVllmEngine):
         self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
         self.kv_rank = self.kv_transfer_config.kv_rank

-    @triton_endpoint(PrefillRequest, PrefillResponse)
+    @dynemo_endpoint(PrefillRequest, PrefillResponse)
     async def generate(self, request):
         if self.engine_client is None:
             await self.initialize()
@@ -66,13 +62,13 @@ class VllmPrefillEngine(BaseVllmEngine):
         yield True

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("prefill")
+    component = runtime.namespace("dynemo").component("prefill")
     await component.create_service()

     async with VllmPrefillEngine(engine_args) as prefill_engine:
...
@@ -20,12 +20,8 @@ import uvloop
 from common.protocol import Request, Response
 from vllm.logger import logger as vllm_logger

-from triton_distributed.llm import KvRouter
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.llm import KvRouter
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker


 class Router:
@@ -41,7 +37,7 @@ class Router:
         self.router = router
         self.workers_client = workers_client

-    @triton_endpoint(Request, Response)
+    @dynemo_endpoint(Request, Response)
     async def generate(self, request):
         lora_id = 0
         worker_id = None
@@ -73,7 +69,7 @@ class Router:
             resp = resp.data() if hasattr(resp, "data") else resp
             yield resp

-    @triton_endpoint(Request, Response)
+    @dynemo_endpoint(Request, Response)
     async def mock_generate(self, request):
         print(f"Received request: {request}")
         yield "Hello, World!"
@@ -82,10 +78,10 @@ class Router:
 ROUTE_SELF = True

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
     workers_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("vllm")
         .endpoint("generate")
         .client()
@@ -102,7 +98,7 @@ async def worker(runtime: DistributedRuntime):
     # simply be ignored, but before that, we will make sure that the services
     # of the same namespace::component are created via KvMetricsPublisher,
     # if it is also used to create endpoints.
-    kv_listener = runtime.namespace("triton-init").component("vllm")
+    kv_listener = runtime.namespace("dynemo").component("vllm")
     await kv_listener.create_service()
     router = KvRouter(runtime, kv_listener)
@@ -111,7 +107,7 @@ async def worker(runtime: DistributedRuntime):
     # i.e. below will cause panic
     #     Router(router, workers_client).mock_generate
     # )

-    router_component = runtime.namespace("triton-init").component("frontend")
+    router_component = runtime.namespace("dynemo").component("frontend")
     await router_component.create_service()

     endpoint = router_component.endpoint("generate")
...
@@ -22,15 +22,11 @@ import uvloop
 from common.protocol import Request, Response
 from vllm.logger import logger as vllm_logger

-from triton_distributed.llm import KvMetricsPublisher
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.llm import KvMetricsPublisher
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker


-class TritonResult:
+class DynemoResult:
     OK = 0
     ERR = 1
@@ -43,19 +39,17 @@ class MockEngine:
     def __init__(self, metrics_publisher, worker_id):
         self.worker_id = worker_id
         # KV events
-        self.lib = ctypes.CDLL("/opt/triton/llm_binding/lib/libtriton_llm_capi.so")
-        self.lib.triton_llm_init.argtypes = [c_char_p, c_char_p, c_int64]
-        self.lib.triton_llm_init.restype = c_uint32
-        result = self.lib.triton_llm_init(
-            "triton-init".encode(), "vllm".encode(), worker_id
-        )
-        if result == TritonResult.OK:
+        self.lib = ctypes.CDLL("/opt/dynemo/llm_binding/lib/libdynemo_llm_capi.so")
+        self.lib.dynemo_llm_init.argtypes = [c_char_p, c_char_p, c_int64]
+        self.lib.dynemo_llm_init.restype = c_uint32
+        result = self.lib.dynemo_llm_init("dynemo".encode(), "vllm".encode(), worker_id)
+        if result == DynemoResult.OK:
             vllm_logger.info(
                 "KVCacheEventManager initialized successfully. Ready to publish KV Cache Events"
             )
         else:
             vllm_logger.info("KVCacheEventManager initialization failed!")

-        self.lib.triton_kv_event_publish_stored.argtypes = [
+        self.lib.dynemo_kv_event_publish_stored.argtypes = [
             ctypes.c_uint64,  # event_id
             ctypes.POINTER(ctypes.c_uint32),  # token_ids
             ctypes.POINTER(ctypes.c_size_t),  # num_block_tokens
@@ -64,18 +58,18 @@
             ctypes.POINTER(ctypes.c_uint64),  # parent_hash
             ctypes.c_uint64,  # lora_id
         ]
-        self.lib.triton_kv_event_publish_stored.restype = (
+        self.lib.dynemo_kv_event_publish_stored.restype = (
             ctypes.c_uint32
-        )  # triton_llm_result_t
+        )  # dynemo_llm_result_t

-        self.lib.triton_kv_event_publish_removed.argtypes = [
+        self.lib.dynemo_kv_event_publish_removed.argtypes = [
             ctypes.c_uint64,  # event_id
             ctypes.POINTER(ctypes.c_uint64),  # block_ids
             ctypes.c_size_t,  # num_blocks
         ]
-        self.lib.triton_kv_event_publish_removed.restype = (
+        self.lib.dynemo_kv_event_publish_removed.restype = (
             ctypes.c_uint32
-        )  # triton_llm_result_t
+        )  # dynemo_llm_result_t

         # KV metrics
         self.metrics_publisher = metrics_publisher
@@ -95,7 +89,7 @@ class MockEngine:
         self.event_id_counter = 0
         self.tokens = [3] * 64

-    @triton_endpoint(Request, Response)
+    @dynemo_endpoint(Request, Response)
     async def generate(self, request):
         print(f"Received request: {request}")
         self.request_active_slots = min(
@@ -117,7 +111,7 @@ class MockEngine:
             if self.event_id_counter > 0
             else None
         )
-        result = self.lib.triton_kv_event_publish_stored(
+        result = self.lib.dynemo_kv_event_publish_stored(
             self.event_id_counter,  # uint64_t event_id
             (ctypes.c_uint32 * len(self.tokens))(
                 *self.tokens
@@ -132,7 +126,7 @@ class MockEngine:
         )
         self.event_id_counter += 1

-        if result == TritonResult.OK:
+        if result == DynemoResult.OK:
             vllm_logger.debug(f"Store - Published KV Event: {self.event_id_counter}")
         else:
             vllm_logger.debug(
@@ -152,13 +146,13 @@ class MockEngine:
             )

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("vllm")
+    component = runtime.namespace("dynemo").component("vllm")
     metrics_publisher = KvMetricsPublisher()
     await metrics_publisher.create_service(component)
...
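The hunks above declare argtypes for both publish calls but only exercise `dynemo_kv_event_publish_stored`. For completeness, a hedged sketch of the matching `removed` call, built strictly from the argtypes declared above (event_id: `uint64`, block_ids: `uint64*`, num_blocks: `size_t`); the sample block ids are made up:

```python
# Hedged sketch: publishing a "removed" KV event through the renamed C API,
# mirroring the argtypes the MockEngine declares above.
import ctypes


def publish_removed(lib, event_id: int, block_ids: list[int]) -> int:
    arr = (ctypes.c_uint64 * len(block_ids))(*block_ids)  # block_ids array
    return lib.dynemo_kv_event_publish_removed(
        ctypes.c_uint64(event_id),        # event_id
        arr,                              # block_ids
        ctypes.c_size_t(len(block_ids)),  # num_blocks
    )


# e.g. result = publish_removed(engine.lib, engine.event_id_counter, [42, 43])
# then compare against DynemoResult.OK, as the stored-event path does.
```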
@@ -34,12 +34,7 @@ from vllm.logger import logger as vllm_logger
 from vllm.outputs import RequestOutput
 from vllm.transformers_utils.tokenizer import AnyTokenizer

-from triton_distributed.runtime import (
-    Client,
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import Client, DistributedRuntime, dynemo_endpoint, dynemo_worker


 class RequestType(Enum):
@@ -162,38 +157,38 @@ class Processor(ProcessMixIn):
                 f"Request type {request_type} not implemented"
             )

-    @triton_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+    @dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
     async def generate_chat(self, raw_request):
         async for response in self._generate(raw_request, RequestType.CHAT):
             yield response

-    @triton_endpoint(CompletionRequest, CompletionStreamResponse)
+    @dynemo_endpoint(CompletionRequest, CompletionStreamResponse)
     async def generate_completions(self, raw_request):
         async for response in self._generate(raw_request, RequestType.COMPLETION):
             yield response

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     """
     Set up clients to the router and workers.
-    Serve the triton-init.process.chat/completions endpoint.
+    Serve the dynemo.process.chat/completions endpoint.
     """
     workers_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("vllm")
         .endpoint("generate")
         .client()
     )
     router_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("router")
         .endpoint("generate")
         .client()
     )

-    preprocess_component = runtime.namespace("triton-init").component("process")
+    preprocess_component = runtime.namespace("dynemo").component("process")
     await preprocess_component.create_service()

     chat_endpoint = preprocess_component.endpoint("chat/completions")
...
@@ -23,12 +23,8 @@ import uvloop
 from common.protocol import Tokens
 from vllm.logger import logger as vllm_logger

-from triton_distributed.llm import KvRouter
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.llm import KvRouter
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker

 WorkerId = str
@@ -55,7 +51,7 @@ class Router:
         self.router = router
         self.routing_strategy = routing_strategy

-    @triton_endpoint(Tokens, WorkerId)
+    @dynemo_endpoint(Tokens, WorkerId)
     async def generate(self, request) -> AsyncIterator[WorkerId]:
         lora_id = 0
         worker_id = None
@@ -82,14 +78,14 @@ class Router:
         )

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, args: Namespace):
     """
     Set up the worker clients.
-    Serve the triton-init.router.generate endpoint.
+    Serve the dynemo.router.generate endpoint.
     """
     workers_client = (
-        await runtime.namespace("triton-init")
+        await runtime.namespace("dynemo")
         .component("vllm")
         .endpoint("generate")
         .client()
@@ -114,10 +110,10 @@ async def worker(runtime: DistributedRuntime, args: Namespace):
         + "\n".join(f"id: {id}" for id in workers_client.endpoint_ids())
     )

-    kv_listener = runtime.namespace("triton-init").component("vllm")
+    kv_listener = runtime.namespace("dynemo").component("vllm")
     await kv_listener.create_service()

-    router_component = runtime.namespace("triton-init").component("router")
+    router_component = runtime.namespace("dynemo").component("router")
     await router_component.create_service()

     router = KvRouter(runtime, kv_listener)
...
@@ -25,12 +25,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.logger import logger as vllm_logger
 from vllm.sampling_params import RequestOutputKind

-from triton_distributed.llm import KvMetricsPublisher
-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.llm import KvMetricsPublisher
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker

 vllm_logger.info(f"VLLM_KV_CAPI_PATH: {os.environ['VLLM_KV_CAPI_PATH']}")
@@ -52,7 +48,7 @@ class VllmEngine(BaseVllmEngine):
             assert self.engine_client is not None, "engine_client was not initialized"
             self.engine_client.set_metrics_publisher(self.metrics_publisher)

-    @triton_endpoint(vLLMGenerateRequest, MyRequestOutput)
+    @dynemo_endpoint(vLLMGenerateRequest, MyRequestOutput)
     async def generate(self, request) -> AsyncIterator:
         assert (
             self.engine_client is not None
@@ -77,12 +73,12 @@ class VllmEngine(BaseVllmEngine):
             ).model_dump_json()

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     """
-    Serve the triton-init.vllm.generate endpoint.
+    Serve the dynemo.vllm.generate endpoint.
     """
-    worker_component = runtime.namespace("triton-init").component("vllm")
+    worker_component = runtime.namespace("dynemo").component("vllm")
     await worker_component.create_service()

     worker_endpoint = worker_component.endpoint("generate")
@@ -91,7 +87,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
     vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")

-    VLLM_KV_NAMESPACE = "triton-init"
+    VLLM_KV_NAMESPACE = "dynemo"
     os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)

     VLLM_KV_COMPONENT = "vllm"
...
@@ -28,11 +28,7 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.logger import logger as vllm_logger

-from triton_distributed.runtime import (
-    DistributedRuntime,
-    triton_endpoint,
-    triton_worker,
-)
+from dynemo.runtime import DistributedRuntime, dynemo_endpoint, dynemo_worker


 class VllmEngine(BaseVllmEngine, ProcessMixIn):
@@ -43,7 +39,7 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn):
     def __init__(self, engine_args: AsyncEngineArgs):
         super().__init__(engine_args)

-    @triton_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
+    @dynemo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
     async def generate(self, raw_request):
         if self.engine_client is None:
             await self.initialize()
@@ -75,13 +71,13 @@ class VllmEngine(BaseVllmEngine, ProcessMixIn):
             yield response

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("vllm")
+    component = runtime.namespace("dynemo").component("vllm")
     await component.create_service()

     endpoint = component.endpoint("generate")
...
@@ -19,27 +19,27 @@ import asyncio
 import uvloop
 from preprocessor.common import parse_vllm_args

-from triton_distributed.runtime import (
+from dynemo.runtime import (
     DistributedRuntime,
     ModelDeploymentCard,
     OAIChatPreprocessor,
-    triton_worker,
+    dynemo_worker,
 )

 uvloop.install()


-@triton_worker()
+@dynemo_worker()
 async def preprocessor(runtime: DistributedRuntime, model_name: str, model_path: str):
     # create model deployment card
     mdc = await ModelDeploymentCard.from_local_path(model_path, model_name)

     # create preprocessor endpoint
-    component = runtime.namespace("triton-init").component("preprocessor")
+    component = runtime.namespace("dynemo").component("preprocessor")
     await component.create_service()
     endpoint = component.endpoint("generate")

     # create backend endpoint
-    backend = runtime.namespace("triton-init").component("backend").endpoint("generate")
+    backend = runtime.namespace("dynemo").component("backend").endpoint("generate")

     # start preprocessor service with next backend
     chat = OAIChatPreprocessor(mdc, endpoint, next=backend)
...
@@ -28,12 +28,12 @@ from vllm.entrypoints.openai.api_server import (
 )
 from vllm.outputs import CompletionOutput

-from triton_distributed.runtime import (
+from dynemo.runtime import (
     Backend,
     DistributedRuntime,
     ModelDeploymentCard,
-    triton_endpoint,
-    triton_worker,
+    dynemo_endpoint,
+    dynemo_worker,
 )

 finish_reason_map = {
@@ -107,7 +107,7 @@ class VllmEngine(AsyncContextDecorator):
         }
         return SamplingParams(**sampling_params)

-    @triton_endpoint(Any, CompletionOutput)
+    @dynemo_endpoint(Any, CompletionOutput)
     async def generate(self, request):
         state = DeltaState()
         request_id = str(uuid.uuid4())
@@ -122,13 +122,13 @@ class VllmEngine(AsyncContextDecorator):
                 yield self.to_backend_output(choice, delta_token_ids)

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: NvAsyncEngineArgs):
     """
     Instantiate a `backend` component and serve the `generate` endpoint
     A `Component` can serve multiple endpoints
     """
-    component = runtime.namespace("triton-init").component("backend")
+    component = runtime.namespace("dynemo").component("backend")
     await component.create_service()

     endpoint = component.endpoint("generate")
...
@@ -21,12 +21,12 @@
 if [ $# -gt 2 ]; then
     echo "Usage: $0 [model_name] [endpoint_name]"
     echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
-    echo "Optional: endpoint_name (default: triton-init.vllm.generate)"
+    echo "Optional: endpoint_name (default: dynemo.vllm.generate)"
     exit 1
 fi

 MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-ENDPOINT_NAME=${2:-"triton-init.vllm.generate"}
+ENDPOINT_NAME=${2:-"dynemo.vllm.generate"}
 SESSION_NAME="vllm_disagg"
 WORKDIR="$(dirname $0)/.."
 INIT_CMD="cd $WORKDIR"
@@ -49,7 +49,7 @@ tmux split-window -v
 ########################################################
 HTTP_HOST="localhost"
 HTTP_PORT=8080
-HTTP_CMD="TRD_LOG=DEBUG http --host ${HTTP_HOST} --port ${HTTP_PORT}"
+HTTP_CMD="DYN_LOG=DEBUG http --host ${HTTP_HOST} --port ${HTTP_PORT}"

 tmux select-pane -t 0
 tmux send-keys "$INIT_CMD && $HTTP_CMD" C-m
...
@@ -22,18 +22,18 @@ if [ $# -lt 2 ]; then
     echo "Usage: $0 <number_of_workers> <routing_strategy> [model_name] [endpoint_name]"
     echo "Error: Must specify at least number of workers and routing strategy"
     echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
-    echo "Optional: endpoint_name (default: triton-init.process.chat/completions)"
+    echo "Optional: endpoint_name (default: dynemo.process.chat/completions)"
     exit 1
 fi

 NUM_WORKERS=$1
 ROUTING_STRATEGY=$2
 MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
-ENDPOINT_NAME=${4:-"triton-init.process.chat/completions"}
+ENDPOINT_NAME=${4:-"dynemo.process.chat/completions"}
 VALID_STRATEGIES=("prefix")
 SESSION_NAME="v"
 WORKDIR="/workspace/examples/python_rs/llm/vllm"
-INIT_CMD="source /opt/triton/venv/bin/activate && cd $WORKDIR"
+INIT_CMD="source /opt/dynemo/venv/bin/activate && cd $WORKDIR"

 if [[ ! " ${VALID_STRATEGIES[@]} " =~ " ${ROUTING_STRATEGY} " ]]; then
     echo "Error: Invalid routing strategy. Must be one of: ${VALID_STRATEGIES[*]}"
@@ -42,7 +42,7 @@ fi
 ########################################################
 # HTTP Server
 ########################################################
-HTTP_CMD="TRD_LOG=DEBUG http"
+HTTP_CMD="DYN_LOG=DEBUG http"
 tmux new-session -d -s "$SESSION_NAME-http"
 tmux send-keys -t "$SESSION_NAME-http" "$INIT_CMD && $HTTP_CMD" C-m
...
@@ -33,7 +33,7 @@ All of the commands below are run inside the same container.
 ## Run deployment

-Add model to triton and start http server.
+Add model to dynemo and start http server.

 In terminal 0:
 ```
@@ -65,7 +65,7 @@ CUDA_VISIBLE_DEVICES=0 python prefill_worker.py \
     --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
     --enforce-eager \
     --kv-transfer-config \
-    '{"kv_connector":"TritonNixlConnector"}'
+    '{"kv_connector":"DynemoNixlConnector"}'
 ```

 In terminal 2:
@@ -78,7 +78,7 @@ CUDA_VISIBLE_DEVICES=1,2 python3 worker.py \
     --enforce-eager \
     --tensor-parallel-size 2 \
     --kv-transfer-config \
-    '{"kv_connector":"TritonNixlConnector"}'
+    '{"kv_connector":"DynemoNixlConnector"}'
 ```
@@ -157,7 +157,7 @@ rm -r /tmp/nixl
 - [x] Zero copy
 - [x] Conditional remote prefill
 - [x] Manual example with tp > 1
-- [x] Run on triton distributed runtime
+- [x] Run on dynemo distributed runtime
 - [x] add oai http endpoint
 - [x] Sample only on decode, do note return remote prefill response
 - [x] Check if all transfers finished before moving to decode
...
@@ -22,7 +22,7 @@ from vllm.distributed.device_communicators.nixl import NixlMetadata
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.utils import FlexibleArgumentParser

-from triton_distributed.runtime import DistributedRuntime
+from dynemo.runtime import DistributedRuntime

 METADATA_DIR = "/tmp/nixl"
...
@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.api_server import (
 from vllm.inputs.data import TokensPrompt
 from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest

-from triton_distributed.runtime import DistributedRuntime, triton_worker
+from dynemo.runtime import DistributedRuntime, dynemo_worker


 class RequestHandler:
@@ -71,7 +71,7 @@ class RequestHandler:
         yield

-@triton_worker()
+@dynemo_worker()
 async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
     component = runtime.namespace("test-nixl").component("prefill")
     await component.create_service()
...