Unverified Commit 9b8b9988 authored by Dmitry Tokarev's avatar Dmitry Tokarev Committed by GitHub
Browse files

chore: upgrade trtllm 1.2.0rc2 (#4405)


Signed-off-by: default avatarDmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: default avatartanmayv25 <tanmay2592@gmail.com>
Co-authored-by: default avatarKyle McGill <kmcgill@nvidia.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
Co-authored-by: default avatarTanmay Verma <tanmayv@nvidia.com>
parent 6d69e8c7
......@@ -182,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"pipeline_parallel_size": config.pipeline_parallel_size,
"moe_expert_parallel_size": config.expert_parallel_size,
"backend": Backend.PYTORCH,
"skip_tokenizer_init": True,
"build_config": build_config,
"kv_cache_config": kv_cache_config,
"gpus_per_node": gpus_per_node,
......@@ -241,12 +240,10 @@ async def init(runtime: DistributedRuntime, config: Config):
# Populate default sampling params from the model
tokenizer = tokenizer_factory(arg_map["model"])
default_sampling_params = SamplingParams()
default_sampling_params._setup(tokenizer)
default_sampling_params.stop = None
# Enable perf metrics so prompt_tokens_details can be returned
if hasattr(default_sampling_params, "return_perf_metrics"):
default_sampling_params.return_perf_metrics = True
model_input = ModelInput.Tokens
# Set model type based on disaggregation mode for unified frontend support
......
......@@ -41,11 +41,11 @@ class Config:
self.kv_block_size: int = 32
self.migration_limit: int = 0
self.gpus_per_node: Optional[int] = None
self.max_batch_size: int = BuildConfig.max_batch_size
self.max_num_tokens: int = BuildConfig.max_num_tokens
self.max_seq_len: int = BuildConfig.max_seq_len
self.max_beam_width: int = BuildConfig.max_beam_width
self.free_gpu_memory_fraction: Optional[float] = None
self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
self.max_seq_len: int = BuildConfig.model_fields["max_seq_len"].default
self.max_beam_width: int = BuildConfig.model_fields["max_beam_width"].default
self.free_gpu_memory_fraction: float = 0.9
self.extra_engine_args: str = ""
self.override_engine_args: str = ""
self.publish_events_and_metrics: bool = False
......@@ -176,26 +176,26 @@ def cmd_line_args():
parser.add_argument(
"--max-batch-size",
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.",
)
parser.add_argument(
"--max-num-tokens",
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help="Maximum number of batched input tokens after padding is removed in each batch.",
)
parser.add_argument(
"--max-seq-len",
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.",
)
parser.add_argument(
"--max-beam-width",
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.",
)
parser.add_argument(
......
......@@ -2,18 +2,18 @@
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04"
ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG PYTORCH_BASE_IMAGE_TAG="25.06-py3"
ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3"
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG RUNTIME_IMAGE_TAG="25.10-cuda13.0-runtime-ubuntu24.04"
# TensorRT-LLM specific configuration
ARG HAS_TRTLLM_CONTEXT=0
ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
ARG TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
ARG GITHUB_TRTLLM_COMMIT
# Define general architecture ARGs for supporting both x86 and aarch64 builds.
......@@ -72,6 +72,7 @@ RUN apt-get update && \
git \
git-lfs \
ca-certificates && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Copy uv
......@@ -82,16 +83,12 @@ RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION
# Copy pytorch installation from NGC PyTorch
ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6
ARG TORCHVISION_VER=0.22.0a0+95f10a4e
ARG SETUPTOOLS_VER=78.1.1
ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal
ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10
ARG TORCH_TENSORRT_VER=2.9.0a0
ARG TORCHVISION_VER=0.24.0a0+094e7af5
ARG JINJA2_VER=3.1.6
ARG NETWORKX_VER=3.5
ARG SYMPY_VER=1.14.0
ARG PACKAGING_VER=23.2
ARG FLASH_ATTN_VER=2.7.4.post1
ARG MPMATH_VER=1.3.0
ARG FLASH_ATTN_VER=2.7.4.post1+25.10
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
......@@ -107,8 +104,8 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sy
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info
# Install TensorRT-LLM and related dependencies
ARG HAS_TRTLLM_CONTEXT
......@@ -120,8 +117,7 @@ ARG GITHUB_TRTLLM_COMMIT
COPY --from=trtllm_wheel /*.whl /trtllm_wheel/
COPY --from=trtllm_wheel /*.txt /trtllm_wheel/
# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6.
RUN uv pip install "cuda-python>=12,<13"
RUN uv pip install --no-cache "cuda-python==13.0.2"
# Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
# because there might be mismatched versions of TensorRT between the NGC PyTorch
......@@ -141,7 +137,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
# Install from local wheel directory in build context
WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
if [ -n "$WHEEL_FILE" ]; then \
uv pip install "$WHEEL_FILE"; \
uv pip install --no-cache "$WHEEL_FILE"; \
else \
echo "No wheel file found in /trtllm_wheel directory."; \
exit 1; \
......@@ -155,7 +151,10 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \
# TRTLLM 1.2.0rc2 has issues installing from pypi with uv, installing from direct wheel link works best
# explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x64_64 for some reason
export TENSORRTLLM_PIP_WHEEL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-1.2.0rc2-cp312-cp312-linux_${ARCH_ALT}.whl"; \
uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" triton==3.5.0; \
fi
##################################################
......@@ -190,12 +189,27 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# workaround for pickle lib issue
ENV OMPI_MCA_coll_ucc_enable=0
# Use UCX KVCACHE by default
ENV TRTLLM_USE_UCX_KVCACHE=1
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
# Install Python, build-essential and python3-dev as apt dependencies
RUN apt-get update && \
RUN if [ ${ARCH_ALT} = "x86_64" ]; then \
ARCH_FOR_GPG=${ARCH_ALT}; \
else \
ARCH_FOR_GPG="sbsa"; \
fi && \
curl -fsSL \
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG}/cuda-archive-keyring.gpg \
-o /usr/share/keyrings/cuda-archive-keyring.gpg &&\
echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] \
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG} /" \
| tee /etc/apt/sources.list.d/cuda.repo.list > /dev/null &&\
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Build tools
build-essential \
......@@ -209,7 +223,8 @@ RUN apt-get update && \
# jq for polling various endpoints and health checks
jq \
# CUDA/ML libraries
libcudnn9-cuda-12 \
libcudnn9-cuda-13 \
libnvshmem3-cuda-13 \
# Network and communication libraries
libzmq3-dev \
# RDMA/UCX libraries required to find RDMA devices
......@@ -228,6 +243,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
ENV LD_LIBRARY_PATH="/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:${LD_LIBRARY_PATH}"
# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image
COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
......@@ -238,6 +255,16 @@ COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/
COPY --from=pytorch_base /usr/local/cuda/bin/cuobjdump /usr/local/cuda/bin/cuobjdump
COPY --from=pytorch_base /usr/local/cuda/bin/nvdisasm /usr/local/cuda/bin/nvdisasm
ENV CUDA_HOME=/usr/local/cuda \
TRITON_CUPTI_PATH=/usr/local/cuda/include \
TRITON_CUDACRT_PATH=/usr/local/cuda/include \
TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include
# Copy nats and etcd from dynamo_base image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
......@@ -255,8 +282,6 @@ COPY --from=pytorch_base /opt/hpcx /opt/hpcx
# This is needed to make libucc.so visible so pytorch can use it.
ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}"
# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them
# pytorch-triton is copied after trtllm installation.
COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/
# Copy uv to system /bin
......@@ -274,6 +299,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
&& chown -R dynamo: /workspace /home/dynamo /opt/dynamo \
&& chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo
# Switch to dynamo user
USER dynamo
ENV HOME=/home/dynamo
......@@ -299,17 +325,18 @@ ENV OPAL_PREFIX=/opt/hpcx/ompi
COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
ENV LD_LIBRARY_PATH=/opt/dynamo/venv/lib/python3.12/site-packages/torch/lib:/opt/dynamo/venv/lib/python3.12/site-packages/torch_tensorrt/lib:${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
# Install dynamo, NIXL, and dynamo-specific dependencies
COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
--no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \
......@@ -321,8 +348,11 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
UV_GIT_LFS=1 uv pip install \
--no-cache \
--index-strategy unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/cu130 \
--requirement /tmp/requirements.txt \
--requirement /tmp/requirements.test.txt
--requirement /tmp/requirements.test.txt \
cupy-cuda13x
# Copy tests, benchmarks, deploy and components for CI with correct ownership
COPY --chown=dynamo: tests /workspace/tests
......@@ -346,7 +376,6 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
USER dynamo
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -374,29 +403,30 @@ USER root
# Install utilities as root
RUN apt-get update -y && \
apt-get install -y --no-install-recommends \
# Install utilities
nvtop \
wget \
tmux \
vim \
git \
iproute2 \
rsync \
zip \
unzip \
htop \
# Build Dependencies
autoconf \
automake \
cmake \
libtool \
meson \
net-tools \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler && \
# Install utilities
nvtop \
wget \
tmux \
vim \
git \
iproute2 \
rsync \
zip \
unzip \
htop \
# Build Dependencies
autoconf \
automake \
cmake \
libtool \
meson \
net-tools \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set workspace directory variable
......@@ -412,10 +442,10 @@ COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
# Install maturin, for maturin develop
RUN uv pip install maturin[patchelf]
RUN uv pip install --no-cache maturin[patchelf]
# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
RUN uv pip install --no-deps -e .
RUN uv pip install --no-cache --no-deps -e .
CMD []
......@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
# Base Images
TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
TRTLLM_BASE_IMAGE_TAG=25.06-py3
TRTLLM_BASE_IMAGE_TAG=25.10-py3
# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source.
......@@ -89,19 +89,18 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492"
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL=""
# TensorRT-LLM PyPI index URL
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc2"
TENSORRTLLM_PIP_WHEEL=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
......
......@@ -23,11 +23,11 @@ set -ex
GITHUB_URL="https://github.com"
UCX_VERSION="v1.18.1"
UCX_VERSION="v1.19.1"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"
NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
NIXL_COMMIT="97c9b5b48e2ed3f1f2539c461c4971a7db8b1197"
UCX_REPO="https://github.com/openucx/ucx.git"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
......
......@@ -49,7 +49,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies]
trtllm =[
"uvloop",
"tensorrt-llm==1.1.0rc5",
"tensorrt-llm==1.2.0rc2",
]
vllm = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment