Unverified commit 5f179186, authored by Tanmay Verma and committed by GitHub
Browse files

refactor: Migrate to new UX2 for python launch (#2003)

parent fc124360
...@@ -19,12 +19,12 @@ trap cleanup EXIT INT TERM ...@@ -19,12 +19,12 @@ trap cleanup EXIT INT TERM
# run clear_namespace # run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo python3 utils/clear_namespace.py --namespace dynamo
# run ingress # run frontend
dynamo run in=http out=dyn --http-port=8000 & python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
python3 components/worker.py \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" --extra-engine-args "$AGG_ENGINE_ARGS"
...@@ -19,12 +19,12 @@ trap cleanup EXIT INT TERM ...@@ -19,12 +19,12 @@ trap cleanup EXIT INT TERM
# run clear_namespace # run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo python3 utils/clear_namespace.py --namespace dynamo
# run ingress # run frontend
dynamo run in=http out=dyn --router-mode kv --http-port=8000 & python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run worker # run worker
python3 components/worker.py \ python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \ --extra-engine-args "$AGG_ENGINE_ARGS" \
......
...@@ -23,12 +23,12 @@ trap cleanup EXIT INT TERM ...@@ -23,12 +23,12 @@ trap cleanup EXIT INT TERM
# run clear_namespace # run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo python3 utils/clear_namespace.py --namespace dynamo
# run ingress # run frontend
dynamo run in=http out=dyn --http-port=8000 & python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
# run prefill worker # run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \
...@@ -37,7 +37,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py ...@@ -37,7 +37,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \
......
...@@ -23,8 +23,8 @@ trap cleanup EXIT INT TERM ...@@ -23,8 +23,8 @@ trap cleanup EXIT INT TERM
# run clear_namespace # run clear_namespace
python3 utils/clear_namespace.py --namespace dynamo python3 utils/clear_namespace.py --namespace dynamo
# run ingress # run frontend
dynamo run in=http out=dyn --router-mode kv --http-port=8000 & python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
DYNAMO_PID=$! DYNAMO_PID=$!
...@@ -37,7 +37,7 @@ else ...@@ -37,7 +37,7 @@ else
fi fi
# run prefill worker # run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py \ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \
...@@ -47,7 +47,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py ...@@ -47,7 +47,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 components/worker.py
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 components/worker.py \ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \ --model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \ --served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \
......
...@@ -13,4 +13,4 @@ sleep 3 ...@@ -13,4 +13,4 @@ sleep 3
# Start OpenAI Frontend which will dynamically discover workers when they startup # Start OpenAI Frontend which will dynamically discover workers when they startup
# NOTE: This is a blocking call. # NOTE: This is a blocking call.
dynamo-run in=http out=dyn --http-port 8000 python3 -m dynamo.frontend --http-port 8000
...@@ -39,7 +39,7 @@ if [[ -n ${DISAGGREGATION_STRATEGY} ]]; then ...@@ -39,7 +39,7 @@ if [[ -n ${DISAGGREGATION_STRATEGY} ]]; then
fi fi
trtllm-llmapi-launch \ trtllm-llmapi-launch \
python3 /mnt/components/worker.py \ python3 -m dynamo.trtllm \
--model-path "${MODEL_PATH}" \ --model-path "${MODEL_PATH}" \
--served-model-name "${SERVED_MODEL_NAME}" \ --served-model-name "${SERVED_MODEL_NAME}" \
--extra-engine-args "${ENGINE_CONFIG}" \ --extra-engine-args "${ENGINE_CONFIG}" \
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.trtllm.main import main
if __name__ == "__main__":
main()
...@@ -3,10 +3,8 @@ ...@@ -3,10 +3,8 @@
import asyncio import asyncio
import logging import logging
import os
import signal import signal
import sys import sys
from typing import TYPE_CHECKING
import uvloop import uvloop
from tensorrt_llm import SamplingParams from tensorrt_llm import SamplingParams
...@@ -21,48 +19,16 @@ from dynamo.llm import ( ...@@ -21,48 +19,16 @@ from dynamo.llm import (
) )
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.utils.request_handlers.handlers import (
if TYPE_CHECKING:
from utils.trtllm_utils import Config
def _setup_path_and_imports():
"""Setup path and import utils modules"""
# Add the parent directory to the Python path so we can import utils
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from utils.request_handlers.handlers import (
RequestHandlerConfig,
RequestHandlerFactory,
)
from utils.trtllm_utils import (
Config,
cmd_line_args,
is_first_worker,
parse_endpoint,
)
return (
RequestHandlerConfig,
RequestHandlerFactory,
Config,
cmd_line_args,
is_first_worker,
parse_endpoint,
)
# Import utils modules
(
RequestHandlerConfig, RequestHandlerConfig,
RequestHandlerFactory, RequestHandlerFactory,
)
from dynamo.trtllm.utils.trtllm_utils import (
Config, Config,
cmd_line_args, cmd_line_args,
is_first_worker, is_first_worker,
parse_endpoint, parse_endpoint,
) = _setup_path_and_imports() )
# Default buffer size for kv cache events. # Default buffer size for kv cache events.
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024 DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024
...@@ -205,6 +171,9 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -205,6 +171,9 @@ async def init(runtime: DistributedRuntime, config: Config):
await endpoint.serve_endpoint(handler.generate) await endpoint.serve_endpoint(handler.generate)
def main():
uvloop.run(worker())
if __name__ == "__main__": if __name__ == "__main__":
uvloop.install() main()
asyncio.run(worker())
...@@ -19,11 +19,14 @@ from enum import Enum ...@@ -19,11 +19,14 @@ from enum import Enum
from tensorrt_llm import SamplingParams from tensorrt_llm import SamplingParams
from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
from utils.disagg_utils import DisaggregatedParams, DisaggregatedParamsCodec
from dynamo.llm.tensorrtllm.engine import TensorRTLLMEngine from dynamo.llm.tensorrtllm.engine import TensorRTLLMEngine
from dynamo.llm.tensorrtllm.publisher import Publisher from dynamo.llm.tensorrtllm.publisher import Publisher
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.utils.disagg_utils import (
DisaggregatedParams,
DisaggregatedParamsCodec,
)
configure_dynamo_logging() configure_dynamo_logging()
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import copy import copy
from utils.request_handlers.handler_base import ( from dynamo.trtllm.utils.request_handlers.handler_base import (
DisaggregationMode, DisaggregationMode,
DisaggregationStrategy, DisaggregationStrategy,
HandlerBase, HandlerBase,
......
...@@ -4,14 +4,14 @@ ...@@ -4,14 +4,14 @@
import argparse import argparse
from typing import Optional from typing import Optional
from utils.request_handlers.handler_base import ( from dynamo.trtllm.utils.request_handlers.handler_base import (
DisaggregationMode, DisaggregationMode,
DisaggregationStrategy, DisaggregationStrategy,
) )
# Default endpoint for the next worker. # Default endpoint for the next worker.
DEFAULT_ENDPOINT = "dyn://dynamo.tensorrt_llm.generate" DEFAULT_ENDPOINT = "dyn://dynamo.tensorrt_llm.generate"
DEFAULT_MODEL_PATH = "TinyLlama-1.1B-Instruct" DEFAULT_MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DEFAULT_NEXT_ENDPOINT = "dyn://dynamo.tensorrt_llm_next.generate" DEFAULT_NEXT_ENDPOINT = "dyn://dynamo.tensorrt_llm_next.generate"
DEFAULT_DISAGGREGATION_STRATEGY = DisaggregationStrategy.DECODE_FIRST DEFAULT_DISAGGREGATION_STRATEGY = DisaggregationStrategy.DECODE_FIRST
DEFAULT_DISAGGREGATION_MODE = DisaggregationMode.AGGREGATED DEFAULT_DISAGGREGATION_MODE = DisaggregationMode.AGGREGATED
...@@ -123,7 +123,7 @@ def cmd_line_args(): ...@@ -123,7 +123,7 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--publish-events-and-metrics", "--publish-events-and-metrics",
action="store_true", action="store_true",
help="Publish events and metrics to the dynamo components. Note: This is not supported when running in prefill disaggregation mode.", help="If set, publish events and metrics to the dynamo components.",
) )
parser.add_argument( parser.add_argument(
"--disaggregation-mode", "--disaggregation-mode",
......
...@@ -79,7 +79,7 @@ requires = ["hatchling"] ...@@ -79,7 +79,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build" build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel]
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/frontend/src/dynamo", "components/backends/llama_cpp/src/dynamo", "components/backends/mocker/src/dynamo"] packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/frontend/src/dynamo", "components/backends/llama_cpp/src/dynamo", "components/backends/mocker/src/dynamo", "components/backends/trtllm/src/dynamo"]
# This section is for including the binaries in the wheel package # This section is for including the binaries in the wheel package
# but doesn't make them executable scripts in the venv bin directory # but doesn't make them executable scripts in the venv bin directory
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment