Unverified Commit 52503032 authored by Dmitry Tokarev, committed by GitHub
Browse files

chore: TRTLLM 1.2.0rc4 (#4836)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent 00f8615e
......@@ -46,7 +46,7 @@ dependencies = [
"pydantic>=2",
"tabulate",
"types-tabulate",
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc2/rc3 (==4.56.0), SGLang 0.5.6 (==4.57.1)
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6 (==4.57.1)
"transformers>=4.56.0,<=4.57.1",
"pytest-mypy",
]
......
......@@ -22,7 +22,6 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv(
import uvloop
from prometheus_client import REGISTRY
from tensorrt_llm.llmapi import (
BuildConfig,
CapacitySchedulerPolicy,
DynamicBatchConfig,
KvCacheConfig,
......@@ -162,13 +161,6 @@ async def init(runtime: DistributedRuntime, config: Config):
else:
gpus_per_node = config.gpus_per_node
build_config = BuildConfig(
max_batch_size=config.max_batch_size,
max_num_tokens=config.max_num_tokens,
max_beam_width=config.max_beam_width,
max_seq_len=config.max_seq_len,
)
kv_cache_config = KvCacheConfig(
free_gpu_memory_fraction=config.free_gpu_memory_fraction
)
......@@ -190,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"pipeline_parallel_size": config.pipeline_parallel_size,
"moe_expert_parallel_size": config.expert_parallel_size,
"backend": Backend.PYTORCH,
"build_config": build_config,
"kv_cache_config": kv_cache_config,
"gpus_per_node": gpus_per_node,
"max_num_tokens": config.max_num_tokens,
......
......@@ -314,6 +314,7 @@ COPY components/ /opt/dynamo/components/
# Build dynamo wheels
ARG ENABLE_KVBM
ARG USE_SCCACHE
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
......@@ -453,7 +454,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
# TRTLLM 1.2.0rc2 has issues installing from pypi with uv, installing from direct wheel link works best
# TRTLLM 1.2.0rc5 has issues installing from PyPI with uv; installing from a direct wheel link works best
# explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x86_64 for some reason
if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
......
......@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e4c707845ff58fcc0b1d87afb4dd0e64885c780a" # 1.2.0rc5
TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL=""
......@@ -98,7 +98,7 @@ TRTLLM_GIT_URL=""
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc3"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc5"
TENSORRTLLM_PIP_WHEEL=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
......
......@@ -52,7 +52,7 @@ tensorboard==2.19.0
tensorboardX==2.6.2.2
# Transformers version constraint for container builds
# - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
# - TensorRT-LLM 1.2.0rc2/rc3: ==4.56.0
# - TensorRT-LLM 1.2.0rc5: ==4.56.0
# - SGLang 0.5.6: ==4.57.1
# Using >=4.56.0 and <=4.57.1 to satisfy all frameworks
transformers>=4.56.0,<=4.57.1
......
......@@ -96,15 +96,6 @@ To deploy `Llama-4-Maverick-17B-128E-Instruct` in disaggregated mode, you will n
For high-performance multimodal inference, Dynamo supports pre-computed embeddings with an **Encode-Prefill-Decode (EPD)** flow using **NIXL (RDMA)** for zero-copy tensor transfer.
### Enabling the Feature
This is an experimental feature that requires using a specific TensorRT-LLM commit.
To enable it, build the Dynamo container with the `--tensorrtllm-commit` flag:
```bash
./container/build.sh --framework trtllm --tensorrtllm-git-url https://github.com/NVIDIA/TensorRT-LLM.git --tensorrtllm-commit v1.2.0rc3
```
### Supported File Types
- `.pt` - PyTorch tensor files
......
......@@ -58,12 +58,12 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
### Build Dependency
| **Build Dependency** | **Version as of Dynamo v0.7.0** |
| :------------------- | :------------------------------------------------------------------------------- |
| **SGLang** | 0.5.3.post4 |
| **TensorRT-LLM** | 1.2.0rc2 |
| **vLLM** | 0.11.0 |
| **NIXL** | 0.7.1 |
| **Build Dependency** | **Version as of Dynamo v0.7.0** |
| :------------------- | :------------------------------ |
| **SGLang** | 0.5.3.post4 |
| **TensorRT-LLM** | 1.2.0rc5 |
| **vLLM** | 0.11.0 |
| **NIXL** | 0.7.1 |
> [!Important]
......
......@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies]
trtllm =[
"uvloop",
"tensorrt-llm==1.2.0rc3",
"tensorrt-llm==1.2.0rc5",
]
vllm = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment