Unverified Commit 03d976c7 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

refactor: Refactor the TRTLLM example components and improve UI (#1654)


Signed-off-by: default avatarTanmay Verma <tanmayv@nvidia.com>
parent 8a2d6529
......@@ -110,7 +110,7 @@ dynamo serve graphs.agg:Frontend -f ./configs/agg.yaml
#### Aggregated serving with KV Routing
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
dynamo serve graphs.agg:Frontend -f ./configs/agg_router.yaml
```
#### Disaggregated serving
......@@ -122,7 +122,7 @@ dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
#### Disaggregated serving with KV Routing
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
dynamo serve graphs.disagg:Frontend -f ./configs/disagg_router.yaml
```
#### Aggregated serving with Multi-Token Prediction (MTP) and DeepSeek R1
......
This diff is collapsed.
......@@ -14,136 +14,28 @@
# limitations under the License.
import argparse
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Tuple
import yaml
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
@dataclass
class LLMAPIConfig:
    """Bundle of arguments destined for the TRT-LLM LLM API.

    The model name/path and the three structured sub-configs are kept as
    attributes; every other keyword argument is collected verbatim in
    ``extra_args`` and merged back in by :meth:`to_dict`.
    """

    def __init__(
        self,
        model_name: str,
        model_path: str | None = None,
        pytorch_backend_config: PyTorchConfig | None = None,
        kv_cache_config: KvCacheConfig | None = None,
        speculative_config: DecodingBaseConfig | None = None,
        **kwargs,
    ):
        self.model_name = model_name
        self.model_path = model_path
        self.pytorch_backend_config = pytorch_backend_config
        self.kv_cache_config = kv_cache_config
        self.speculative_config = speculative_config

        # Hardcoded to skip tokenizer init for now: tokenization and
        # detokenization are handled in the base engine, so any value the
        # caller supplied for this flag is discarded.
        kwargs.pop("skip_tokenizer_init", None)
        self.extra_args = kwargs
        self.skip_tokenizer_init = True

    def to_dict(self) -> Dict[str, Any]:
        """Return the keyword arguments as a flat dict.

        Entries in ``extra_args`` are applied last, so they override the
        three fixed keys when the names collide.
        """
        return {
            "kv_cache_config": self.kv_cache_config,
            "speculative_config": self.speculative_config,
            "skip_tokenizer_init": self.skip_tokenizer_init,
            **self.extra_args,
        }

    def update_sub_configs(self, other_config: Dict[str, Any]):
        """Parse known sub-config sections found in ``other_config``.

        For each recognised section present in ``other_config`` the raw value
        is converted to its config object, stored on the matching attribute,
        and the raw entry is removed from ``extra_args``.
        """
        # TODO: Consider removing pytorch_backend_config parsing as this section
        # was collapsed to top level config fields in recent TRTLLM versions.
        section_builders = (
            ("pytorch_backend_config", lambda raw: PyTorchConfig(**raw)),
            ("kv_cache_config", lambda raw: KvCacheConfig(**raw)),
            ("speculative_config", lambda raw: DecodingBaseConfig.from_dict(raw)),
        )
        for section, build in section_builders:
            if section not in other_config:
                continue
            setattr(self, section, build(other_config[section]))
            self.extra_args.pop(section, None)
def _get_llm_args(engine_config):
# Only do model validation checks and leave other checks to LLMAPI
if "model_name" not in engine_config:
raise ValueError("Model name is required in the TRT-LLM engine config.")
if engine_config.get("model_path", ""):
if os.path.exists(engine_config.get("model_path", "")):
engine_config["model_path"] = Path(engine_config["model_path"])
else:
raise ValueError(f"Model path {engine_config['model_path']} does not exist")
model_name = engine_config["model_name"]
model_path = engine_config.get("model_path", None)
engine_config.pop("model_name")
engine_config.pop("model_path", None)
# Store all other args as kwargs
llm_api_config = LLMAPIConfig(
model_name=model_name,
model_path=model_path,
**engine_config,
)
# Parse supported sub configs and remove from kwargs
llm_api_config.update_sub_configs(engine_config)
return llm_api_config
def _init_engine_args(engine_args_filepath):
"""Initialize engine arguments from config file."""
if not os.path.isfile(engine_args_filepath):
raise ValueError(
"'YAML file containing TRT-LLM engine args must be provided in when launching the worker."
)
try:
with open(engine_args_filepath) as file:
trtllm_engine_config = yaml.safe_load(file)
except yaml.YAMLError as e:
raise RuntimeError(f"Failed to parse engine config: {e}")
return _get_llm_args(trtllm_engine_config)
def parse_tensorrt_llm_args(
config_args,
) -> Tuple[Any, Tuple[Dict[str, Any], Dict[str, Any]]]:
) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="A TensorRT-LLM Worker parser")
parser.add_argument(
"--engine_args", type=str, required=True, help="Path to the engine args file"
"--extra-engine-args",
type=str,
default="",
help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.",
)
parser.add_argument(
"--served_model_name",
"--model-path",
type=str,
help="Name of the model to serve",
default=None,
help="Path to disk model or HuggingFace model identifier to load.",
)
parser.add_argument(
"--llmapi-disaggregated-config",
"-c",
"--served_model_name",
type=str,
help="Path to the llmapi disaggregated config file",
default=None,
help="Name to serve the model under.",
)
parser.add_argument(
"--router",
......@@ -152,46 +44,19 @@ def parse_tensorrt_llm_args(
default="random",
help="Router type to use for scheduling requests to workers",
)
parser.add_argument(
"--min-workers",
type=int,
default=1,
help="Minimum number of workers for aggregated (monolith) server",
)
parser.add_argument(
"--min-prefill-workers",
type=int,
default=1,
help="Minimum number of prefill workers for disaggregated server",
)
parser.add_argument(
"--block-size",
"--kv-block-size",
type=int,
default=32,
help="Number of tokens per KV block in TRTLLM worker. Default is 32 for pytorch backend.",
)
parser.add_argument(
"--remote-prefill",
action="store_true",
help="Use remote prefill workers for generation server in Disaggregated mode.",
)
args = parser.parse_args(config_args)
return (args, _init_engine_args(args.engine_args))
def parse_dynamo_run_args() -> Tuple[Any, Tuple[Dict[str, Any], Dict[str, Any]]]:
parser = argparse.ArgumentParser(
description="A TensorRT-LLM Dynamo-run engine parser"
)
parser.add_argument(
"--engine_args", type=str, required=True, help="Path to the engine args file"
)
parser.add_argument(
"--publish-kv-cache-events",
"--enable-disagg",
action="store_true",
help="Publish KV cache events from TensorRT-LLM. Currently, only supported for context worker in Disaggregated mode.",
help="Enable remote prefill for the worker",
)
args, _ = parser.parse_known_args()
return (args, _init_engine_args(args.engine_args))
args = parser.parse_args(config_args)
return args
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import threading
import traceback
import weakref
from enum import Enum
from queue import Queue
from typing import Any, Callable, Coroutine, Optional, TypedDict, Union
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# An async task is either a callable returning a coroutine that resolves to a
# bool, or a weak reference to a bound method.
AsyncTask = Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
class RoutingStrategy(Enum):
    """Closed set of routing strategy identifiers, keyed by their string value."""

    ROUND_ROBIN = "round_robin"
    RANDOM = "random"
    PREFIX = "prefix"
class RequestType(Enum):
    """Closed set of request type identifiers, keyed by their string value."""

    CHAT = "chat"
    COMPLETION = "completion"
class ServerType(Enum):
    """Closed set of server role identifiers, keyed by their string value."""

    # Generation server used for disaggregated and aggregated requests
    GEN = "gen"
    # Context server used for disaggregated requests
    CTX = "ctx"
    # Dynamo run server used for Dynamo run requests
    DYN_RUN = "dyn_run"
class ConversationMessage(TypedDict):
    """Typed dict for one conversation message with ``role`` and ``content`` strings."""

    role: str
    content: str
class ManagedThread(threading.Thread):
    """Daemon thread that repeatedly runs an async task on a given event loop.

    Each iteration submits ``task(**kwargs)`` to ``loop`` with
    ``asyncio.run_coroutine_threadsafe`` and blocks until it finishes. The
    cycle repeats until :meth:`stop` is called, the task is ``None`` (or an
    expired ``WeakMethod``), or no loop has been set. Exceptions raised while
    running the task are logged and, when an ``error_queue`` was supplied,
    put on that queue; the cycle then continues.
    """

    def __init__(
        self,
        task: Optional[AsyncTask],
        error_queue: Optional[Queue] = None,
        name: Optional[str] = None,
        loop: Optional[asyncio.AbstractEventLoop] = None,
        **kwargs,
    ):
        super().__init__(name=name)
        self.daemon = True
        self.stop_event = threading.Event()
        self.task = task
        self.loop = loop
        self.error_queue = error_queue
        # Extra keyword arguments are forwarded to the task on every call.
        self.kwargs = kwargs

    def set_loop(self, loop: asyncio.AbstractEventLoop):
        """Set the event loop the task is submitted to."""
        self.loop = loop

    def run(self):
        """Run the task in a cycle until stopped or no longer runnable."""
        while not self.stop_event.is_set():
            task: Optional[AsyncTask] = self.task
            if isinstance(task, weakref.WeakMethod):
                # Dereference the weak method; None means its owner is gone.
                task = task()
                if task is None:
                    # Normally, this should not happen.
                    logger.warning("WeakMethod is expired.")
                    break
            elif task is None:
                break

            if self.loop is None:
                logger.error("[ManagedThread] Loop not initialized!")
                break

            try:
                # Block this thread until the coroutine completes on the loop.
                asyncio.run_coroutine_threadsafe(
                    task(**self.kwargs), self.loop
                ).result()
            except Exception as e:
                logger.error(
                    f"Error in thread {self.name}: {e}\n{traceback.format_exc()}"
                )
                if self.error_queue is not None:
                    self.error_queue.put(e)

        logger.info(f"Thread {self.name} stopped.")

    def stop(self):
        """Request that the run cycle exit before its next iteration."""
        self.stop_event.set()
......@@ -12,15 +12,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
from common.base_engine import BaseTensorrtLLMEngine
from common.base_engine import BaseEngineConfig, BaseTensorrtLLMEngine
from common.parser import parse_tensorrt_llm_args
from common.protocol import TRTLLMWorkerRequest
from common.utils import ServerType
from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
from dynamo.sdk import async_on_start, dynamo_context, endpoint, on_shutdown, service
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
......@@ -39,34 +37,37 @@ class TensorRTLLMPrefillWorker(BaseTensorrtLLMEngine):
class_name = self.__class__.__name__
config = ServiceConfig.get_instance()
config_args = config.as_args(class_name, prefix="")
args, engine_config = parse_tensorrt_llm_args(config_args)
worker_id = dynamo_context["endpoints"][0].lease_id()
super().__init__(
namespace_str="dynamo",
component_str=class_name,
worker_id=worker_id,
engine_config=engine_config,
remote_prefill=args.remote_prefill,
min_workers=args.min_workers,
disagg_config_file=args.llmapi_disaggregated_config,
block_size=args.block_size,
router=args.router,
server_type=ServerType.CTX,
args = parse_tensorrt_llm_args(config_args)
lease_id = dynamo_context["endpoints"][0].lease_id()
namespace, _ = TensorRTLLMPrefillWorker.dynamo_address() # type: ignore
engine_config = BaseEngineConfig(
namespace=namespace,
component=class_name,
endpoint="generate",
model_path=args.model_path,
served_model_name=args.served_model_name,
kv_block_size=args.kv_block_size,
extra_engine_args=args.extra_engine_args,
publish_events_and_metrics=False,
disaggregation_mode="prefill",
remote_prefill_endpoint=None,
lease_id=lease_id,
)
super().__init__(config=engine_config)
@async_on_start
async def async_init(self):
self._init_engine()
if self._kv_metrics_publisher is not None:
task = asyncio.create_task(self.create_metrics_publisher_endpoint())
task.add_done_callback(
lambda _: logger.info("metrics publisher endpoint created")
)
runtime = dynamo_context["runtime"]
await self.initialize(runtime)
logger.info("TensorRT-LLM Prefill Worker initialized")
async def create_metrics_publisher_endpoint(self):
component = dynamo_context["component"]
await self.kv_metrics_publisher.create_endpoint(component)
@on_shutdown
async def async_cleanup(self):
logger.info("Cleaning up TensorRT-LLM Prefill Worker")
await self.cleanup()
logger.info("TensorRT-LLM Prefill Worker cleanup completed")
@endpoint()
async def generate(self, request: TRTLLMWorkerRequest):
......
......@@ -12,17 +12,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
from common.base_engine import BaseTensorrtLLMEngine
from common.base_engine import BaseEngineConfig, BaseTensorrtLLMEngine
from common.parser import parse_tensorrt_llm_args
from common.protocol import TRTLLMWorkerRequest
from common.utils import ServerType
from components.prefill_worker import TensorRTLLMPrefillWorker
from dynamo.llm import ModelType, register_llm
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
from dynamo.sdk import (
async_on_start,
depends,
dynamo_context,
endpoint,
on_shutdown,
service,
)
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
......@@ -43,74 +48,66 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine):
class_name = self.__class__.__name__
config = ServiceConfig.get_instance()
config_args = config.as_args(class_name, prefix="")
args, engine_config = parse_tensorrt_llm_args(config_args)
self.served_model_name = args.served_model_name
worker_id = dynamo_context["endpoints"][0].lease_id()
args = parse_tensorrt_llm_args(config_args)
lease_id = dynamo_context["endpoints"][0].lease_id()
namespace, _ = TensorRTLLMWorker.dynamo_address() # type: ignore
self._min_prefill_workers = args.min_prefill_workers
super().__init__(
namespace_str=namespace,
component_str=class_name,
worker_id=worker_id,
engine_config=engine_config,
remote_prefill=args.remote_prefill,
min_workers=args.min_workers,
disagg_config_file=args.llmapi_disaggregated_config,
block_size=args.block_size,
router=args.router,
server_type=ServerType.GEN,
endpoint_name = "generate"
publish_events_and_metrics = args.router == "kv"
prefill_class_name = "TensorRTLLMPrefillWorker"
if args.enable_disagg:
disaggregation_mode = "decode"
else:
disaggregation_mode = "prefill_and_decode"
engine_config = BaseEngineConfig(
namespace=namespace,
component=class_name,
endpoint=endpoint_name,
model_path=args.model_path,
served_model_name=args.served_model_name,
kv_block_size=args.kv_block_size,
extra_engine_args=args.extra_engine_args,
publish_events_and_metrics=publish_events_and_metrics,
disaggregation_mode=disaggregation_mode,
remote_prefill_endpoint=f"dyn://{namespace}.{prefill_class_name}.generate",
lease_id=lease_id,
)
super().__init__(config=engine_config)
@async_on_start
async def async_init(self):
self._init_engine()
runtime = dynamo_context["runtime"]
await self.initialize(runtime)
logger.info("Registering LLM for discovery")
comp_ns, comp_name = TensorRTLLMWorker.dynamo_address() # type: ignore
endpoint = runtime.namespace(comp_ns).component(comp_name).endpoint("generate")
endpoint = (
runtime.namespace(self._config.namespace)
.component(self._config.component)
.endpoint(self._config.endpoint)
)
try:
await register_llm(
ModelType.Backend,
endpoint,
self._engine_config.model_name,
self.served_model_name,
kv_cache_block_size=self._kv_block_size,
self._config.model_path,
self._config.served_model_name,
kv_cache_block_size=self._config.kv_block_size,
)
logger.info("Successfully registered LLM for discovery")
except Exception as e:
logger.error(f"Failed to register LLM for discovery: {e}")
raise
if self._remote_prefill:
runtime = dynamo_context["runtime"]
comp_ns, comp_name = TensorRTLLMPrefillWorker.dynamo_address() # type: ignore
self._prefill_client = (
await runtime.namespace(comp_ns)
.component(comp_name)
.endpoint("generate")
.client()
)
while len(self._prefill_client.instance_ids()) < self._min_prefill_workers:
logger.info(
f"Waiting for prefill workers to be ready.\n"
f" Current: {len(self._prefill_client.instance_ids())},"
f" Required: {self._min_prefill_workers}"
)
await asyncio.sleep(30)
if self._kv_metrics_publisher is not None:
task = asyncio.create_task(self.create_metrics_publisher_endpoint())
task.add_done_callback(
lambda _: logger.info("metrics publisher endpoint created")
)
logger.info("TensorRT-LLM Worker initialized")
async def create_metrics_publisher_endpoint(self):
component = dynamo_context["component"]
await self._kv_metrics_publisher.create_endpoint(component)
@on_shutdown
async def async_cleanup(self):
logger.info("Cleaning up TensorRT-LLM Worker")
await self.cleanup()
logger.info("TensorRT-LLM Worker cleanup completed")
@endpoint()
async def generate(self, request: TRTLLMWorkerRequest):
......
......@@ -20,8 +20,13 @@ Frontend:
router: round-robin
TensorRTLLMWorker:
# Path to disk model or HuggingFace model identifier to load
model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Name to serve the model under
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
engine_args: "configs/llm_api_config.yaml"
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
extra-engine-args: "configs/engine_configs/agg_config.yaml"
router: round-robin
ServiceArgs:
workers: 1
......
......@@ -20,9 +20,15 @@ Frontend:
router: kv
TensorRTLLMWorker:
engine_args: "configs/llm_api_config_router.yaml"
# Path to disk model or HuggingFace model identifier to load
model-path: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Name to serve the model under
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
extra-engine-args: "configs/engine_configs/agg_config.yaml"
router: kv
ServiceArgs:
workers: 1
resources:
gpu: 1
gpu: 1
\ No newline at end of file
......@@ -22,7 +22,12 @@ Frontend:
TensorRTLLMWorker:
served_model_name: "nvidia/DeepSeek-R1-FP4"
engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model-path: "nvidia/DeepSeek-R1-FP4"
extra-engine-args: "configs/deepseek_r1/engine_configs/agg_config.yaml"
router: round-robin
ServiceArgs:
workers: 1
......
......@@ -22,14 +22,13 @@ Frontend:
TensorRTLLMWorker:
served_model_name: "nvidia/DeepSeek-R1-FP4"
engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
llmapi-disaggregated-config: "configs/deepseek_r1/disagg_llm_api_config.yaml"
remote-prefill: true
# NOTE: When testing/benchmarking multiple prefill workers, you can set
# this number to the exact amount of prefill workers if you want Dynamo to
# wait until all the prefill workers are ready before marking the decode
# worker ready.
min-prefill-workers: 1
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model-path: "nvidia/DeepSeek-R1-FP4"
extra-engine-args: "configs/deepseek_r1/engine_configs/decode_config.yaml"
enable-disagg: true
router: round-robin
ServiceArgs:
workers: 1
......@@ -37,8 +36,12 @@ TensorRTLLMWorker:
gpu: 4
TensorRTLLMPrefillWorker:
engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
llmapi-disaggregated-config: "configs/deepseek_r1/disagg_llm_api_config.yaml"
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model-path: "nvidia/DeepSeek-R1-FP4"
extra-engine-args: "configs/deepseek_r1/engine_configs/prefill_config.yaml"
router: round-robin
ServiceArgs:
workers: 1
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example Configs for Context & Generation on GB200 nodes
# - Context on 1xGB200 (4xB200)
# - Generation on 1xGB200 (4xB200)
# NOTE: Fields like hostname, ports, urls, num_instances, etc. only used by trtllm-serve, not by dynamo
backend: pytorch
context_servers:
# Context/prefill processes many tokens at once, so for a large ISL, a large
# batch size may not be needed to saturate GPU utilization.
max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
# TP/EP/PP/DP
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
enable_attention_dp: true
kv_cache_config:
free_gpu_memory_fraction: 0.75
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only
disable_overlap_scheduler: true
print_iter_log: true
# NOTE: This dtype must match in both context/generation configs
kv_cache_dtype: fp8
generation_servers:
# Generation/decode processes one token per request at a time, so a larger
# batch size helps to saturate GPU utilization.
max_batch_size: 256
max_num_tokens: 256
# 8448 = 8192 ISL + 256 OSL
max_seq_len: 8448
# TP/EP/PP/DP
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
enable_attention_dp: false
kv_cache_config:
# With dp attention disabled: high free_gpu_memory_fraction is fine.
free_gpu_memory_fraction: 0.85
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
# NOTE: This dtype must match in both context/generation configs
kv_cache_dtype: fp8
......@@ -12,12 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model_name: "nvidia/DeepSeek-R1-FP4"
backend: pytorch
# TP/EP/PP/DP
......
......@@ -12,32 +12,44 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: pytorch
# In the case of disaggregated deployment, this config will apply to each server
# and will be overwritten by the disaggregated config file
# TODO: figure out how to generate this from the service config or vice versa
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_path: null
tensor_parallel_size: 1
moe_expert_parallel_size: 1
# TP/EP/PP/DP
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
max_batch_size: 256
max_num_tokens: 256
# 8448 = 8192 ISL + 256 OSL
max_seq_len: 8448
kv_cache_config:
free_gpu_memory_fraction: 0.95
event_buffer_max_size: 1024
enable_block_reuse: true
# With dp attention disabled: high free_gpu_memory_fraction is fine.
free_gpu_memory_fraction: 0.85
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
use_cuda_graph: true
enable_iter_perf_stats: true
cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
......@@ -12,32 +12,30 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: pytorch
# TP/EP/PP/DP
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
enable_attention_dp: true
# In the case of disaggregated deployment, this config will apply to each server
# and will be overwritten by the disaggregated config file
# TODO: figure out how to generate this from the service config or vice versa
model_name: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model_path: null
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_batch_size: 1
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
max_seq_len: 8192
kv_cache_config:
free_gpu_memory_fraction: 0.95
event_buffer_max_size: 1024
enable_block_reuse: true
# With dp attention disabled: high free_gpu_memory_fraction is fine.
free_gpu_memory_fraction: 0.75
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true
enable_iter_perf_stats: true
disable_overlap_scheduler: true
print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs
kv_cache_dtype: fp8
......@@ -18,7 +18,6 @@
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model_name: "nvidia/DeepSeek-R1-FP4"
backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
max_batch_size: 256
# Note: When MTP is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy the following formula:
# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1)
# This is a known issue in TensorRT-LLM and will be resolved in the next release.
max_num_tokens: 512
# 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704
kv_cache_config:
free_gpu_memory_fraction: 0.85
# Enable the MTP(Multi-Token Prediction) in decode model engine
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: true
max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config:
free_gpu_memory_fraction: 0.75
print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true
# Enable the MTP(Multi-Token Prediction) in the prefill model engine
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
......@@ -21,7 +21,14 @@ Frontend:
TensorRTLLMWorker:
served_model_name: "nvidia/DeepSeek-R1-FP4"
engine_args: "configs/deepseek_r1/mtp/mtp_agg_llm_api_config.yaml"
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model-path: "nvidia/DeepSeek-R1-FP4"
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/agg_config.yaml"
router: round-robin
ServiceArgs:
workers: 1
......
......@@ -21,19 +21,30 @@ Frontend:
TensorRTLLMWorker:
served_model_name: "nvidia/DeepSeek-R1-FP4"
engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
llmapi-disaggregated-config: "configs/deepseek_r1/mtp/mtp_disagg_llm_api_config.yaml"
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model-path: "nvidia/DeepSeek-R1-FP4"
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/decode_config.yaml"
router: round-robin
remote-prefill: true
min-prefill-workers: 1
enable-disagg: true
ServiceArgs:
workers: 1
resources:
gpu: 4
TensorRTLLMPrefillWorker:
engine_args: "configs/deepseek_r1/agg_llm_api_config.yaml"
llmapi-disaggregated-config: "configs/deepseek_r1/mtp/mtp_disagg_llm_api_config.yaml"
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
model-path: "nvidia/DeepSeek-R1-FP4"
# Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
# The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
extra-engine-args: "configs/deepseek_r1/mtp/engine_configs/prefill_config.yaml"
router: round-robin
ServiceArgs:
workers: 1
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: FP4 only supported starting with Blackwell GPUs.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.
backend: pytorch
context_servers:
num_instances: 1
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: true
max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config:
free_gpu_memory_fraction: 0.75
print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true
# Enable the MTP(Multi-Token Prediction) in the prefill model engine
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
generation_servers:
num_instances: 1
tensor_parallel_size: 4
moe_expert_parallel_size: 4
enable_attention_dp: false
max_batch_size: 256
# Note: When MTP is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy the following formula:
# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1)
# This is a known issue in TensorRT-LLM and will be resolved in the next release.
max_num_tokens: 512
# 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704
kv_cache_config:
free_gpu_memory_fraction: 0.85
# Enable the MTP(Multi-Token Prediction) in the decode model engine
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment