Unverified commit 5f179186, authored by Tanmay Verma, committed by GitHub
Browse files

refactor: Migrate to new UX2 for python launch (#2003)

parent fc124360
......@@ -19,12 +19,12 @@ trap cleanup EXIT INT TERM
# run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo
# run ingress
dynamo run in=http out=dyn --http-port=8000 &
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!
# run worker
python3 components/worker.py \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS"
......@@ -19,12 +19,12 @@ trap cleanup EXIT INT TERM
# run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo
# run ingress
dynamo run in=http out=dyn --router-mode kv --http-port=8000 &
# run frontend
python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
DYNAMO_PID=$!
# run worker
python3 components/worker.py \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
......
......@@ -23,12 +23,12 @@ trap cleanup EXIT INT TERM
# run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo
# run ingress
dynamo run in=http out=dyn --http-port=8000 &
# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!
# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
......@@ -37,7 +37,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py
PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
......
......@@ -23,8 +23,8 @@ trap cleanup EXIT INT TERM
# run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo
# run ingress
dynamo run in=http out=dyn --router-mode kv --http-port=8000 &
# run frontend
python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
DYNAMO_PID=$!
......@@ -37,7 +37,7 @@ else
fi
# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
......@@ -47,7 +47,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py
PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
......
......@@ -13,4 +13,4 @@ sleep 3
# Start OpenAI Frontend which will dynamically discover workers when they startup
# NOTE: This is a blocking call.
dynamo-run in=http out=dyn --http-port 8000
python3 -m dynamo.frontend --http-port 8000
......@@ -39,7 +39,7 @@ if [[ -n ${DISAGGREGATION_STRATEGY} ]]; then
fi
trtllm-llmapi-launch \
python3 /mnt/components/worker.py \
python3 -m dynamo.trtllm \
--model-path "${MODEL_PATH}" \
--served-model-name "${SERVED_MODEL_NAME}" \
--extra-engine-args "${ENGINE_CONFIG}" \
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.trtllm.main import main
if __name__ == "__main__":
main()
......@@ -3,10 +3,8 @@
import asyncio
import logging
import os
import signal
import sys
from typing import TYPE_CHECKING
import uvloop
from tensorrt_llm import SamplingParams
......@@ -21,48 +19,16 @@ from dynamo.llm import (
)
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
if TYPE_CHECKING:
from utils.trtllm_utils import Config
def _setup_path_and_imports():
"""Setup path and import utils modules"""
# Add the parent directory to the Python path so we can import utils
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from utils.request_handlers.handlers import (
RequestHandlerConfig,
RequestHandlerFactory,
)
from utils.trtllm_utils import (
Config,
cmd_line_args,
is_first_worker,
parse_endpoint,
)
return (
RequestHandlerConfig,
RequestHandlerFactory,
Config,
cmd_line_args,
is_first_worker,
parse_endpoint,
)
# Import utils modules
(
from dynamo.trtllm.utils.request_handlers.handlers import (
RequestHandlerConfig,
RequestHandlerFactory,
)
from dynamo.trtllm.utils.trtllm_utils import (
Config,
cmd_line_args,
is_first_worker,
parse_endpoint,
) = _setup_path_and_imports()
)
# Default buffer size for kv cache events.
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024
......@@ -205,6 +171,9 @@ async def init(runtime: DistributedRuntime, config: Config):
await endpoint.serve_endpoint(handler.generate)
def main():
uvloop.run(worker())
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
main()
......@@ -19,11 +19,14 @@ from enum import Enum
from tensorrt_llm import SamplingParams
from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
from utils.disagg_utils import DisaggregatedParams, DisaggregatedParamsCodec
from dynamo.llm.tensorrtllm.engine import TensorRTLLMEngine
from dynamo.llm.tensorrtllm.publisher import Publisher
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.utils.disagg_utils import (
DisaggregatedParams,
DisaggregatedParamsCodec,
)
configure_dynamo_logging()
......
......@@ -3,7 +3,7 @@
import copy
from utils.request_handlers.handler_base import (
from dynamo.trtllm.utils.request_handlers.handler_base import (
DisaggregationMode,
DisaggregationStrategy,
HandlerBase,
......
......@@ -4,14 +4,14 @@
import argparse
from typing import Optional
from utils.request_handlers.handler_base import (
from dynamo.trtllm.utils.request_handlers.handler_base import (
DisaggregationMode,
DisaggregationStrategy,
)
# Default endpoint for the next worker.
DEFAULT_ENDPOINT = "dyn://dynamo.tensorrt_llm.generate"
DEFAULT_MODEL_PATH = "TinyLlama-1.1B-Instruct"
DEFAULT_MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DEFAULT_NEXT_ENDPOINT = "dyn://dynamo.tensorrt_llm_next.generate"
DEFAULT_DISAGGREGATION_STRATEGY = DisaggregationStrategy.DECODE_FIRST
DEFAULT_DISAGGREGATION_MODE = DisaggregationMode.AGGREGATED
......@@ -123,7 +123,7 @@ def cmd_line_args():
parser.add_argument(
"--publish-events-and-metrics",
action="store_true",
help="Publish events and metrics to the dynamo components. Note: This is not supported when running in prefill disaggregation mode.",
help="If set, publish events and metrics to the dynamo components.",
)
parser.add_argument(
"--disaggregation-mode",
......
......@@ -79,7 +79,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/frontend/src/dynamo", "components/backends/llama_cpp/src/dynamo", "components/backends/mocker/src/dynamo"]
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/frontend/src/dynamo", "components/backends/llama_cpp/src/dynamo", "components/backends/mocker/src/dynamo", "components/backends/trtllm/src/dynamo"]
# This section is for including the binaries in the wheel package
# but doesn't make them executable scripts in the venv bin directory
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment