Unverified Commit 9b8b9988 authored by Dmitry Tokarev, committed by GitHub
Browse files

chore: upgrade trtllm 1.2.0rc2 (#4405)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: tanmayv25 <tanmay2592@gmail.com>
Co-authored-by: Kyle McGill <kmcgill@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
Co-authored-by: Tanmay Verma <tanmayv@nvidia.com>
parent 6d69e8c7
...@@ -182,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -182,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
"pipeline_parallel_size": config.pipeline_parallel_size, "pipeline_parallel_size": config.pipeline_parallel_size,
"moe_expert_parallel_size": config.expert_parallel_size, "moe_expert_parallel_size": config.expert_parallel_size,
"backend": Backend.PYTORCH, "backend": Backend.PYTORCH,
"skip_tokenizer_init": True,
"build_config": build_config, "build_config": build_config,
"kv_cache_config": kv_cache_config, "kv_cache_config": kv_cache_config,
"gpus_per_node": gpus_per_node, "gpus_per_node": gpus_per_node,
...@@ -241,12 +240,10 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -241,12 +240,10 @@ async def init(runtime: DistributedRuntime, config: Config):
# Populate default sampling params from the model # Populate default sampling params from the model
tokenizer = tokenizer_factory(arg_map["model"]) tokenizer = tokenizer_factory(arg_map["model"])
default_sampling_params = SamplingParams() default_sampling_params = SamplingParams()
default_sampling_params._setup(tokenizer)
default_sampling_params.stop = None
# Enable perf metrics so prompt_tokens_details can be returned # Enable perf metrics so prompt_tokens_details can be returned
if hasattr(default_sampling_params, "return_perf_metrics"): if hasattr(default_sampling_params, "return_perf_metrics"):
default_sampling_params.return_perf_metrics = True default_sampling_params.return_perf_metrics = True
model_input = ModelInput.Tokens model_input = ModelInput.Tokens
# Set model type based on disaggregation mode for unified frontend support # Set model type based on disaggregation mode for unified frontend support
......
...@@ -41,11 +41,11 @@ class Config: ...@@ -41,11 +41,11 @@ class Config:
self.kv_block_size: int = 32 self.kv_block_size: int = 32
self.migration_limit: int = 0 self.migration_limit: int = 0
self.gpus_per_node: Optional[int] = None self.gpus_per_node: Optional[int] = None
self.max_batch_size: int = BuildConfig.max_batch_size self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
self.max_num_tokens: int = BuildConfig.max_num_tokens self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
self.max_seq_len: int = BuildConfig.max_seq_len self.max_seq_len: int = BuildConfig.model_fields["max_seq_len"].default
self.max_beam_width: int = BuildConfig.max_beam_width self.max_beam_width: int = BuildConfig.model_fields["max_beam_width"].default
self.free_gpu_memory_fraction: Optional[float] = None self.free_gpu_memory_fraction: float = 0.9
self.extra_engine_args: str = "" self.extra_engine_args: str = ""
self.override_engine_args: str = "" self.override_engine_args: str = ""
self.publish_events_and_metrics: bool = False self.publish_events_and_metrics: bool = False
...@@ -176,26 +176,26 @@ def cmd_line_args(): ...@@ -176,26 +176,26 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--max-batch-size", "--max-batch-size",
type=int, type=int,
default=BuildConfig.max_batch_size, default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.", help="Maximum number of requests that the engine can schedule.",
) )
parser.add_argument( parser.add_argument(
"--max-num-tokens", "--max-num-tokens",
type=int, type=int,
default=BuildConfig.max_num_tokens, default=BuildConfig.model_fields["max_num_tokens"].default,
help="Maximum number of batched input tokens after padding is removed in each batch.", help="Maximum number of batched input tokens after padding is removed in each batch.",
) )
parser.add_argument( parser.add_argument(
"--max-seq-len", "--max-seq-len",
type=int, type=int,
default=BuildConfig.max_seq_len, default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. " help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.", "If unspecified, the value is deduced from the model config.",
) )
parser.add_argument( parser.add_argument(
"--max-beam-width", "--max-beam-width",
type=int, type=int,
default=BuildConfig.max_beam_width, default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.", help="Maximum number of beams for beam search decoding.",
) )
parser.add_argument( parser.add_argument(
......
...@@ -2,18 +2,18 @@ ...@@ -2,18 +2,18 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04" ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04"
ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch" ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG PYTORCH_BASE_IMAGE_TAG="25.06-py3" ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3"
ARG ENABLE_KVBM=false ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04" ARG RUNTIME_IMAGE_TAG="25.10-cuda13.0-runtime-ubuntu24.04"
# TensorRT-LLM specific configuration # TensorRT-LLM specific configuration
ARG HAS_TRTLLM_CONTEXT=0 ARG HAS_TRTLLM_CONTEXT=0
ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm" ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" ARG TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
ARG GITHUB_TRTLLM_COMMIT ARG GITHUB_TRTLLM_COMMIT
# Define general architecture ARGs for supporting both x86 and aarch64 builds. # Define general architecture ARGs for supporting both x86 and aarch64 builds.
...@@ -72,6 +72,7 @@ RUN apt-get update && \ ...@@ -72,6 +72,7 @@ RUN apt-get update && \
git \ git \
git-lfs \ git-lfs \
ca-certificates && \ ca-certificates && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Copy uv # Copy uv
...@@ -82,16 +83,12 @@ RUN mkdir -p /opt/dynamo/venv && \ ...@@ -82,16 +83,12 @@ RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION uv venv /opt/dynamo/venv --python $PYTHON_VERSION
# Copy pytorch installation from NGC PyTorch # Copy pytorch installation from NGC PyTorch
ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6 ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10
ARG TORCHVISION_VER=0.22.0a0+95f10a4e ARG TORCH_TENSORRT_VER=2.9.0a0
ARG SETUPTOOLS_VER=78.1.1 ARG TORCHVISION_VER=0.24.0a0+094e7af5
ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal
ARG JINJA2_VER=3.1.6 ARG JINJA2_VER=3.1.6
ARG NETWORKX_VER=3.5
ARG SYMPY_VER=1.14.0 ARG SYMPY_VER=1.14.0
ARG PACKAGING_VER=23.2 ARG FLASH_ATTN_VER=2.7.4.post1+25.10
ARG FLASH_ATTN_VER=2.7.4.post1
ARG MPMATH_VER=1.3.0
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
...@@ -107,8 +104,8 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sy ...@@ -107,8 +104,8 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sy
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info
# Install TensorRT-LLM and related dependencies # Install TensorRT-LLM and related dependencies
ARG HAS_TRTLLM_CONTEXT ARG HAS_TRTLLM_CONTEXT
...@@ -120,8 +117,7 @@ ARG GITHUB_TRTLLM_COMMIT ...@@ -120,8 +117,7 @@ ARG GITHUB_TRTLLM_COMMIT
COPY --from=trtllm_wheel /*.whl /trtllm_wheel/ COPY --from=trtllm_wheel /*.whl /trtllm_wheel/
COPY --from=trtllm_wheel /*.txt /trtllm_wheel/ COPY --from=trtllm_wheel /*.txt /trtllm_wheel/
# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6. RUN uv pip install --no-cache "cuda-python==13.0.2"
RUN uv pip install "cuda-python>=12,<13"
# Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
# because there might be mismatched versions of TensorRT between the NGC PyTorch # because there might be mismatched versions of TensorRT between the NGC PyTorch
...@@ -141,7 +137,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ ...@@ -141,7 +137,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
# Install from local wheel directory in build context # Install from local wheel directory in build context
WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \ WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
if [ -n "$WHEEL_FILE" ]; then \ if [ -n "$WHEEL_FILE" ]; then \
uv pip install "$WHEEL_FILE"; \ uv pip install --no-cache "$WHEEL_FILE"; \
else \ else \
echo "No wheel file found in /trtllm_wheel directory."; \ echo "No wheel file found in /trtllm_wheel directory."; \
exit 1; \ exit 1; \
...@@ -155,7 +151,10 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ ...@@ -155,7 +151,10 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \ sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \ bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \ # TRTLLM 1.2.0rc2 has issues installing from pypi with uv, installing from direct wheel link works best
# explicitly installing triton 3.5.0 as trtllm only lists triton as a dependency on x86_64 for some reason
export TENSORRTLLM_PIP_WHEEL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-1.2.0rc2-cp312-cp312-linux_${ARCH_ALT}.whl"; \
uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" triton==3.5.0; \
fi fi
################################################## ##################################################
...@@ -190,12 +189,27 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv ...@@ -190,12 +189,27 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
# workaround for pickle lib issue
ENV OMPI_MCA_coll_ucc_enable=0
# Use UCX KVCACHE by default
ENV TRTLLM_USE_UCX_KVCACHE=1
ARG DYNAMO_COMMIT_SHA ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
# Install Python, build-essential and python3-dev as apt dependencies # Install Python, build-essential and python3-dev as apt dependencies
RUN apt-get update && \ RUN if [ ${ARCH_ALT} = "x86_64" ]; then \
ARCH_FOR_GPG=${ARCH_ALT}; \
else \
ARCH_FOR_GPG="sbsa"; \
fi && \
curl -fsSL \
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG}/cuda-archive-keyring.gpg \
-o /usr/share/keyrings/cuda-archive-keyring.gpg &&\
echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] \
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG} /" \
| tee /etc/apt/sources.list.d/cuda.repo.list > /dev/null &&\
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Build tools # Build tools
build-essential \ build-essential \
...@@ -209,7 +223,8 @@ RUN apt-get update && \ ...@@ -209,7 +223,8 @@ RUN apt-get update && \
# jq for polling various endpoints and health checks # jq for polling various endpoints and health checks
jq \ jq \
# CUDA/ML libraries # CUDA/ML libraries
libcudnn9-cuda-12 \ libcudnn9-cuda-13 \
libnvshmem3-cuda-13 \
# Network and communication libraries # Network and communication libraries
libzmq3-dev \ libzmq3-dev \
# RDMA/UCX libraries required to find RDMA devices # RDMA/UCX libraries required to find RDMA devices
...@@ -228,6 +243,8 @@ RUN apt-get update && \ ...@@ -228,6 +243,8 @@ RUN apt-get update && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
ENV LD_LIBRARY_PATH="/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:${LD_LIBRARY_PATH}"
# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image # Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image
COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++ COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
...@@ -238,6 +255,16 @@ COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm ...@@ -238,6 +255,16 @@ COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/ COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/ COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/
COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/ COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/
COPY --from=pytorch_base /usr/local/cuda/bin/cuobjdump /usr/local/cuda/bin/cuobjdump
COPY --from=pytorch_base /usr/local/cuda/bin/nvdisasm /usr/local/cuda/bin/nvdisasm
ENV CUDA_HOME=/usr/local/cuda \
TRITON_CUPTI_PATH=/usr/local/cuda/include \
TRITON_CUDACRT_PATH=/usr/local/cuda/include \
TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TRITON_CUDART_PATH=/usr/local/cuda/include
# Copy nats and etcd from dynamo_base image # Copy nats and etcd from dynamo_base image
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
...@@ -255,8 +282,6 @@ COPY --from=pytorch_base /opt/hpcx /opt/hpcx ...@@ -255,8 +282,6 @@ COPY --from=pytorch_base /opt/hpcx /opt/hpcx
# This is needed to make libucc.so visible so pytorch can use it. # This is needed to make libucc.so visible so pytorch can use it.
ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}" ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}"
# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container # Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them
# pytorch-triton is copied after trtllm installation.
COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/
# Copy uv to system /bin # Copy uv to system /bin
...@@ -274,6 +299,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \ ...@@ -274,6 +299,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
&& chown -R dynamo: /workspace /home/dynamo /opt/dynamo \ && chown -R dynamo: /workspace /home/dynamo /opt/dynamo \
&& chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo && chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo
# Switch to dynamo user # Switch to dynamo user
USER dynamo USER dynamo
ENV HOME=/home/dynamo ENV HOME=/home/dynamo
...@@ -299,17 +325,18 @@ ENV OPAL_PREFIX=/opt/hpcx/ompi ...@@ -299,17 +325,18 @@ ENV OPAL_PREFIX=/opt/hpcx/ompi
COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV} COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=/opt/dynamo/venv/lib/python3.12/site-packages/torch/lib:/opt/dynamo/venv/lib/python3.12/site-packages/torch_tensorrt/lib:${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
# Install dynamo, NIXL, and dynamo-specific dependencies # Install dynamo, NIXL, and dynamo-specific dependencies
COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/ COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/ COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \ RUN uv pip install \
--no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \ /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \ /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \ /opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \ && if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \ uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \ fi \
&& cd /opt/dynamo/benchmarks \ && cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \ && UV_GIT_LFS=1 uv pip install --no-cache . \
...@@ -321,8 +348,11 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi ...@@ -321,8 +348,11 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \ --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
UV_GIT_LFS=1 uv pip install \ UV_GIT_LFS=1 uv pip install \
--no-cache \ --no-cache \
--index-strategy unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/cu130 \
--requirement /tmp/requirements.txt \ --requirement /tmp/requirements.txt \
--requirement /tmp/requirements.test.txt --requirement /tmp/requirements.test.txt \
cupy-cuda13x
# Copy tests, benchmarks, deploy and components for CI with correct ownership # Copy tests, benchmarks, deploy and components for CI with correct ownership
COPY --chown=dynamo: tests /workspace/tests COPY --chown=dynamo: tests /workspace/tests
...@@ -346,7 +376,6 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \ ...@@ -346,7 +376,6 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
USER dynamo USER dynamo
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD [] CMD []
...@@ -397,6 +426,7 @@ RUN apt-get update -y && \ ...@@ -397,6 +426,7 @@ RUN apt-get update -y && \
clang \ clang \
libclang-dev \ libclang-dev \
protobuf-compiler && \ protobuf-compiler && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Set workspace directory variable # Set workspace directory variable
...@@ -412,10 +442,10 @@ COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup ...@@ -412,10 +442,10 @@ COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
# Install maturin, for maturin develop # Install maturin, for maturin develop
RUN uv pip install maturin[patchelf] RUN uv pip install --no-cache maturin[patchelf]
# Editable install of dynamo # Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/ COPY pyproject.toml README.md hatch_build.py /workspace/
RUN uv pip install --no-deps -e . RUN uv pip install --no-cache --no-deps -e .
CMD [] CMD []
...@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")") ...@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
# Base Images # Base Images
TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
TRTLLM_BASE_IMAGE_TAG=25.06-py3 TRTLLM_BASE_IMAGE_TAG=25.10-py3
# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch, # Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source. # we need to build the TensorRT-LLM wheel from source.
...@@ -89,19 +89,18 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" ...@@ -89,19 +89,18 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided. # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI # Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit. # variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492" DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
TRTLLM_COMMIT="" TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL="" TRTLLM_GIT_URL=""
# TensorRT-LLM PyPI index URL # TensorRT-LLM PyPI index URL
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5" DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc2"
TENSORRTLLM_PIP_WHEEL="" TENSORRTLLM_PIP_WHEEL=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now # FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Please check https://github.com/ai-dynamo/dynamo/pull/1065 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
......
...@@ -23,11 +23,11 @@ set -ex ...@@ -23,11 +23,11 @@ set -ex
GITHUB_URL="https://github.com" GITHUB_URL="https://github.com"
UCX_VERSION="v1.18.1" UCX_VERSION="v1.19.1"
UCX_INSTALL_PATH="/usr/local/ucx/" UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda" CUDA_PATH="/usr/local/cuda"
NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3" NIXL_COMMIT="97c9b5b48e2ed3f1f2539c461c4971a7db8b1197"
UCX_REPO="https://github.com/openucx/ucx.git" UCX_REPO="https://github.com/openucx/ucx.git"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git" NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
......
...@@ -49,7 +49,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" ...@@ -49,7 +49,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies] [project.optional-dependencies]
trtllm =[ trtllm =[
"uvloop", "uvloop",
"tensorrt-llm==1.1.0rc5", "tensorrt-llm==1.2.0rc2",
] ]
vllm = [ vllm = [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment