Unverified commit 52503032, authored by Dmitry Tokarev, committed by GitHub
Browse files

chore: TRTLLM 1.2.0rc4 (#4836)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent 00f8615e
...@@ -46,7 +46,7 @@ dependencies = [ ...@@ -46,7 +46,7 @@ dependencies = [
"pydantic>=2", "pydantic>=2",
"tabulate", "tabulate",
"types-tabulate", "types-tabulate",
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc2/rc3 (==4.56.0), SGLang 0.5.6 (==4.57.1) # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6 (==4.57.1)
"transformers>=4.56.0,<=4.57.1", "transformers>=4.56.0,<=4.57.1",
"pytest-mypy", "pytest-mypy",
] ]
......
...@@ -22,7 +22,6 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv( ...@@ -22,7 +22,6 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv(
import uvloop import uvloop
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
from tensorrt_llm.llmapi import ( from tensorrt_llm.llmapi import (
BuildConfig,
CapacitySchedulerPolicy, CapacitySchedulerPolicy,
DynamicBatchConfig, DynamicBatchConfig,
KvCacheConfig, KvCacheConfig,
...@@ -162,13 +161,6 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -162,13 +161,6 @@ async def init(runtime: DistributedRuntime, config: Config):
else: else:
gpus_per_node = config.gpus_per_node gpus_per_node = config.gpus_per_node
build_config = BuildConfig(
max_batch_size=config.max_batch_size,
max_num_tokens=config.max_num_tokens,
max_beam_width=config.max_beam_width,
max_seq_len=config.max_seq_len,
)
kv_cache_config = KvCacheConfig( kv_cache_config = KvCacheConfig(
free_gpu_memory_fraction=config.free_gpu_memory_fraction free_gpu_memory_fraction=config.free_gpu_memory_fraction
) )
...@@ -190,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -190,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"pipeline_parallel_size": config.pipeline_parallel_size, "pipeline_parallel_size": config.pipeline_parallel_size,
"moe_expert_parallel_size": config.expert_parallel_size, "moe_expert_parallel_size": config.expert_parallel_size,
"backend": Backend.PYTORCH, "backend": Backend.PYTORCH,
"build_config": build_config,
"kv_cache_config": kv_cache_config, "kv_cache_config": kv_cache_config,
"gpus_per_node": gpus_per_node, "gpus_per_node": gpus_per_node,
"max_num_tokens": config.max_num_tokens, "max_num_tokens": config.max_num_tokens,
......
...@@ -314,6 +314,7 @@ COPY components/ /opt/dynamo/components/ ...@@ -314,6 +314,7 @@ COPY components/ /opt/dynamo/components/
# Build dynamo wheels # Build dynamo wheels
ARG ENABLE_KVBM ARG ENABLE_KVBM
ARG USE_SCCACHE
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \ export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
...@@ -453,7 +454,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ ...@@ -453,7 +454,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \ sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \ bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
# TRTLLM 1.2.0rc2 has issues installing from pypi with uv, installing from direct wheel link works best # TRTLLM 1.2.0rc5 has issues installing from pypi with uv, installing from direct wheel link works best
# explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x86_64 for some reason # explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x86_64 for some reason
if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \ if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \ TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
......
...@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" ...@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided. # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI # Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit. # variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2 DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e4c707845ff58fcc0b1d87afb4dd0e64885c780a" # 1.2.0rc5
TRTLLM_COMMIT="" TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL="" TRTLLM_GIT_URL=""
...@@ -98,7 +98,7 @@ TRTLLM_GIT_URL="" ...@@ -98,7 +98,7 @@ TRTLLM_GIT_URL=""
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/" DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc3" DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc5"
TENSORRTLLM_PIP_WHEEL="" TENSORRTLLM_PIP_WHEEL=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
......
...@@ -52,7 +52,7 @@ tensorboard==2.19.0 ...@@ -52,7 +52,7 @@ tensorboard==2.19.0
tensorboardX==2.6.2.2 tensorboardX==2.6.2.2
# Transformers version constraint for container builds # Transformers version constraint for container builds
# - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
# - TensorRT-LLM 1.2.0rc2/rc3: ==4.56.0 # - TensorRT-LLM 1.2.0rc5: ==4.56.0
# - SGLang 0.5.6: ==4.57.1 # - SGLang 0.5.6: ==4.57.1
# Using >=4.56.0 and <=4.57.1 to satisfy all frameworks # Using >=4.56.0 and <=4.57.1 to satisfy all frameworks
transformers>=4.56.0,<=4.57.1 transformers>=4.56.0,<=4.57.1
......
...@@ -96,15 +96,6 @@ To deploy `Llama-4-Maverick-17B-128E-Instruct` in disaggregated mode, you will n ...@@ -96,15 +96,6 @@ To deploy `Llama-4-Maverick-17B-128E-Instruct` in disaggregated mode, you will n
For high-performance multimodal inference, Dynamo supports pre-computed embeddings with an **Encode-Prefill-Decode (EPD)** flow using **NIXL (RDMA)** for zero-copy tensor transfer. For high-performance multimodal inference, Dynamo supports pre-computed embeddings with an **Encode-Prefill-Decode (EPD)** flow using **NIXL (RDMA)** for zero-copy tensor transfer.
### Enabling the Feature
This is an experimental feature that requires using a specific TensorRT-LLM commit.
To enable it build the dynamo container with the `--tensorrtllm-commit` flag:
```bash
./container/build.sh --framework trtllm --tensorrtllm-git-url https://github.com/NVIDIA/TensorRT-LLM.git --tensorrtllm-commit v1.2.0rc3
```
### Supported File Types ### Supported File Types
- `.pt` - PyTorch tensor files - `.pt` - PyTorch tensor files
......
...@@ -59,9 +59,9 @@ If you are using a **GPU**, the following GPU models and architectures are suppo ...@@ -59,9 +59,9 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
### Build Dependency ### Build Dependency
| **Build Dependency** | **Version as of Dynamo v0.7.0** | | **Build Dependency** | **Version as of Dynamo v0.7.0** |
| :------------------- | :------------------------------------------------------------------------------- | | :------------------- | :------------------------------ |
| **SGLang** | 0.5.3.post4 | | **SGLang** | 0.5.3.post4 |
| **TensorRT-LLM** | 1.2.0rc2 | | **TensorRT-LLM** | 1.2.0rc5 |
| **vLLM** | 0.11.0 | | **vLLM** | 0.11.0 |
| **NIXL** | 0.7.1 | | **NIXL** | 0.7.1 |
......
...@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" ...@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies] [project.optional-dependencies]
trtllm =[ trtllm =[
"uvloop", "uvloop",
"tensorrt-llm==1.2.0rc3", "tensorrt-llm==1.2.0rc5",
] ]
vllm = [ vllm = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment