"vscode:/vscode.git/clone" did not exist on "333ee9834f36e6e36fd80d3edad3396564a3d0d7"
Unverified Commit ba711cc1 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

chore: Upgrade to Tensorrt-LLM 1.3.0rc1 (#5700)


Co-authored-by: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>
parent 9e2a2cc9
...@@ -49,7 +49,7 @@ dependencies = [ ...@@ -49,7 +49,7 @@ dependencies = [
"pydantic>=2", "pydantic>=2",
"tabulate", "tabulate",
"types-tabulate", "types-tabulate",
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc6.post2 (==4.57.1), SGLang 0.5.8 (==4.57.1) # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc1 (==4.57.1), SGLang 0.5.8 (==4.57.1)
"transformers>=4.56.0", "transformers>=4.56.0",
"pytest-mypy", "pytest-mypy",
] ]
......
...@@ -37,6 +37,16 @@ from dynamo.llm import KvEventPublisher, WorkerMetricsPublisher ...@@ -37,6 +37,16 @@ from dynamo.llm import KvEventPublisher, WorkerMetricsPublisher
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
# Use non-blocking RPC calls; control overhead with backoff sleeps.
# Timeout passed to engine.llm.get_stats_async() on every stats poll.
_STATS_TIMEOUT_SEC = 0.01
# Timeout passed to engine.llm.get_kv_cache_events_async() on every KV-event poll.
_KV_EVENTS_TIMEOUT_SEC = 0.0
# Idle backoff for the stats polling loop: the sleep starts at the minimum,
# is multiplied by the factor after each poll that yields no data, is capped
# at the maximum, and resets to the minimum as soon as data arrives.
_PUBLISH_MIN_SLEEP_SEC = 0.01
_PUBLISH_MAX_SLEEP_SEC = 0.1
_PUBLISH_BACKOFF_FACTOR = 2.0
# Same idle backoff scheme, applied to the KV cache event polling loop.
_KV_EVENTS_MIN_SLEEP_SEC = 0.005
_KV_EVENTS_MAX_SLEEP_SEC = 0.02
_KV_EVENTS_BACKOFF_FACTOR = 1.5
def _to_signed_i64(value: int | None) -> int | None: def _to_signed_i64(value: int | None) -> int | None:
"""Convert a Python int to signed 64-bit range by two's complement.""" """Convert a Python int to signed 64-bit range by two's complement."""
...@@ -381,6 +391,32 @@ class Publisher: ...@@ -381,6 +391,32 @@ class Publisher:
name="publish_kv_cache_events_thread", name="publish_kv_cache_events_thread",
) )
async def _polling_loop(
    self,
    fetch_fn,
    handler_fn,
    min_sleep: float,
    max_sleep: float,
    backoff_factor: float,
):
    """Repeatedly drain an async source until the stop event is set.

    Each pass calls ``fetch_fn()`` to obtain a fresh async iterable and hands
    every yielded item to ``handler_fn``. A pass that yields nothing sleeps
    for the current backoff interval, which then grows by ``backoff_factor``
    up to ``max_sleep``. A pass that yields at least one item resets the
    interval to ``min_sleep`` and polls again without sleeping.

    Args:
        fetch_fn: Zero-argument callable returning an async iterable.
        handler_fn: Synchronous callable invoked once per yielded item.
        min_sleep: Initial idle sleep in seconds, restored after any pass
            that produced data.
        max_sleep: Upper bound, in seconds, for the idle sleep.
        backoff_factor: Multiplier applied to the idle sleep after each
            pass that produced no data.

    Timeouts and empty-queue errors raised while fetching are treated as
    "no data" and are not logged. Any other fetch error is logged and the
    loop keeps running.
    """
    sleep_s = min_sleep
    while not self._stop_event.is_set():
        had_data = False
        try:
            async for item in fetch_fn():
                had_data = True
                # Isolate handler failures per item: letting the exception
                # escape the ``async for`` would abandon the iterator and
                # drop every remaining item of this fetch.
                try:
                    handler_fn(item)
                except Exception as e:
                    logging.warning(
                        "Publisher polling loop error: %s", e, exc_info=True
                    )
        except (asyncio.TimeoutError, TimeoutError, asyncio.QueueEmpty):
            # Nothing available within the timeout; fall through to backoff.
            pass
        except Exception as e:
            logging.warning("Publisher polling loop error: %s", e, exc_info=True)
        if had_data:
            sleep_s = min_sleep
        else:
            await asyncio.sleep(sleep_s)
            sleep_s = min(max_sleep, sleep_s * backoff_factor)
async def _publish_stats_task(self): async def _publish_stats_task(self):
""" """
Publish stats to the metrics publisher. Publish stats to the metrics publisher.
...@@ -393,15 +429,19 @@ class Publisher: ...@@ -393,15 +429,19 @@ class Publisher:
logging.error("KV metrics publisher not initialized!") logging.error("KV metrics publisher not initialized!")
return False return False
stats = self.engine.llm.get_stats_async(timeout=5) def handle_stat(stat):
async for stat in stats:
kv_active_blocks = stat["kvCacheStats"]["usedNumBlocks"] kv_active_blocks = stat["kvCacheStats"]["usedNumBlocks"]
logging.debug(f"Publishing stats: kv_active_blocks: {kv_active_blocks}") logging.debug(f"Publishing stats: kv_active_blocks: {kv_active_blocks}")
# TRT-LLM doesn't use data parallelism currently (dp_rank=None) # TRT-LLM doesn't use data parallelism currently (dp_rank=None)
self.metrics_publisher.publish(None, kv_active_blocks) self.metrics_publisher.publish(None, kv_active_blocks)
await self._polling_loop(
lambda: self.engine.llm.get_stats_async(timeout=_STATS_TIMEOUT_SEC),
handle_stat,
_PUBLISH_MIN_SLEEP_SEC,
_PUBLISH_MAX_SLEEP_SEC,
_PUBLISH_BACKOFF_FACTOR,
)
return True return True
async def _publish_kv_cache_events_task(self): async def _publish_kv_cache_events_task(self):
...@@ -418,12 +458,22 @@ class Publisher: ...@@ -418,12 +458,22 @@ class Publisher:
logging.error("No KV event publisher initialized (neither NATS nor ZMQ)!") logging.error("No KV event publisher initialized (neither NATS nor ZMQ)!")
return return
events = self.engine.llm.get_kv_cache_events_async(timeout=5) await self._polling_loop(
async for event in events: lambda: self.engine.llm.get_kv_cache_events_async(
timeout=_KV_EVENTS_TIMEOUT_SEC
),
self._handle_kv_event,
_KV_EVENTS_MIN_SLEEP_SEC,
_KV_EVENTS_MAX_SLEEP_SEC,
_KV_EVENTS_BACKOFF_FACTOR,
)
return True
def _handle_kv_event(self, event):
logging.debug(f"KV cache event received: {event}") logging.debug(f"KV cache event received: {event}")
# Drop events that are not emitted from the global attention layer. # Drop events that are not emitted from the global attention layer.
if self.should_drop_event(event): if self.should_drop_event(event):
continue return
event_id = event["event_id"] event_id = event["event_id"]
data = event["data"] data = event["data"]
...@@ -512,14 +562,10 @@ class Publisher: ...@@ -512,14 +562,10 @@ class Publisher:
) )
elif self.kv_event_publisher: elif self.kv_event_publisher:
# No consolidator: publish to NATS (router subscribes directly) # No consolidator: publish to NATS (router subscribes directly)
self.kv_event_publisher.publish_removed( self.kv_event_publisher.publish_removed(event_id, removed_block_hashes)
event_id, removed_block_hashes
)
elif data["type"] == "created" and self.processing_initial_created_events: elif data["type"] == "created" and self.processing_initial_created_events:
self.update_max_window_size(event) self.update_max_window_size(event)
return True
def start(self): def start(self):
if ( if (
self.publish_kv_cache_events_thread self.publish_kv_cache_events_thread
......
...@@ -42,15 +42,16 @@ ARG ENABLE_MEDIA_FFMPEG ...@@ -42,15 +42,16 @@ ARG ENABLE_MEDIA_FFMPEG
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch" ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3" ARG PYTORCH_BASE_IMAGE_TAG="25.12-py3"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base" ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG RUNTIME_IMAGE_TAG="25.10-cuda13.0-runtime-ubuntu24.04" ARG RUNTIME_IMAGE_TAG="25.12-cuda13.1-runtime-ubuntu24.04"
# TensorRT-LLM specific configuration # TensorRT-LLM specific configuration
ARG HAS_TRTLLM_CONTEXT=0 ARG HAS_TRTLLM_CONTEXT=0
ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm" ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
ARG TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/" ARG TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
ARG GITHUB_TRTLLM_COMMIT ARG GITHUB_TRTLLM_COMMIT
ARG TRTLLM_WHEEL_IMAGE="trtllm_wheel_image_empty"
# SCCACHE configuration # SCCACHE configuration
ARG USE_SCCACHE ARG USE_SCCACHE
...@@ -78,9 +79,16 @@ ARG NIXL_LIBFABRIC_REF ...@@ -78,9 +79,16 @@ ARG NIXL_LIBFABRIC_REF
ARG ARCH=amd64 ARG ARCH=amd64
ARG ARCH_ALT=x86_64 ARG ARCH_ALT=x86_64
# Empty fallback for TRTLLM wheel image copy
FROM alpine:3.20 AS trtllm_wheel_image_empty
RUN mkdir -p /app/tensorrt_llm
# Copy artifacts from NGC PyTorch image # Copy artifacts from NGC PyTorch image
FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_IMAGE_TAG} AS pytorch_base FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_IMAGE_TAG} AS pytorch_base
# Resolve TRTLLM wheel image (can be a stage name or a registry image)
FROM ${TRTLLM_WHEEL_IMAGE} AS trtllm_wheel_image
################################## ##################################
########## Base Image ############ ########## Base Image ############
################################## ##################################
...@@ -509,13 +517,25 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \ ...@@ -509,13 +517,25 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}" PATH="/opt/dynamo/venv/bin:${PATH}"
# Copy pytorch installation from NGC PyTorch # Copy pytorch installation from NGC PyTorch
ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10 ARG FLASHINFER_PYTHON_VER=0.6.1
ARG TORCH_TENSORRT_VER=2.9.0a0 ARG PYTORCH_TRITON_VER=3.5.1+gitbfeb0668.nv25.12
ARG TORCHVISION_VER=0.24.0a0+094e7af5 ARG TORCHAO_VER=0.15.0+git01374eb5
ARG TORCHDATA_VER=0.11.0
ARG TORCHTITAN_VER=0.2.0
ARG TORCH_VER=2.10.0a0+b4e4ee81d3.nv25.12
ARG TORCH_TENSORRT_VER=2.10.0a0
ARG TORCHVISION_VER=0.25.0a0+ca221243
ARG JINJA2_VER=3.1.6 ARG JINJA2_VER=3.1.6
ARG SYMPY_VER=1.14.0 ARG SYMPY_VER=1.14.0
ARG FLASH_ATTN_VER=2.7.4.post1+25.10 ARG FLASH_ATTN_VER=2.7.4.post1+25.12
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchao ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchao
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchao-${TORCHAO_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchao-${TORCHAO_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchdata ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchdata
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchdata-${TORCHDATA_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchdata-${TORCHDATA_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchtitan ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchtitan
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchtitan-${TORCHTITAN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchtitan-${TORCHTITAN_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchgen ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchgen COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torchgen ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torchgen
...@@ -533,15 +553,16 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/fl ...@@ -533,15 +553,16 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/fl
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt
COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info
RUN uv pip install flashinfer-python==${FLASHINFER_PYTHON_VER}
# Install TensorRT-LLM and related dependencies # Install TensorRT-LLM and related dependencies
ARG HAS_TRTLLM_CONTEXT ARG HAS_TRTLLM_CONTEXT
ARG TENSORRTLLM_PIP_WHEEL ARG TENSORRTLLM_PIP_WHEEL
ARG TENSORRTLLM_INDEX_URL ARG TENSORRTLLM_INDEX_URL
ARG GITHUB_TRTLLM_COMMIT ARG GITHUB_TRTLLM_COMMIT
# Copy wheel build context (may be empty for download path)
# Copy only wheel files and commit info from trtllm_wheel stage from build_context COPY --from=trtllm_wheel / /trtllm_wheel/
COPY --from=trtllm_wheel /*.whl /trtllm_wheel/ COPY --from=trtllm_wheel_image /app/tensorrt_llm /trtllm_wheel_image/
COPY --from=trtllm_wheel /*.txt /trtllm_wheel/
RUN uv pip install --no-cache "cuda-python==13.0.2" RUN uv pip install --no-cache "cuda-python==13.0.2"
...@@ -555,39 +576,48 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ ...@@ -555,39 +576,48 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
rm -f /etc/apt/trusted.gpg.d/cuda*.gpg rm -f /etc/apt/trusted.gpg.d/cuda*.gpg
RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
# Download and run install_tensorrt.sh from TensorRT-LLM GitHub before installing the wheel
curl -fsSL --retry 5 --retry-delay 10 --max-time 1800 -o /tmp/install_tensorrt.sh "https://github.com/NVIDIA/TensorRT-LLM/raw/${GITHUB_TRTLLM_COMMIT}/docker/common/install_tensorrt.sh" && \
# Modify the script to use virtual environment pip instead of system pip3
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \
# Install from local wheel directory in build context # Install from local wheel directory in build context
WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \ WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
if [ -n "$WHEEL_FILE" ]; then \ if [ -n "$WHEEL_FILE" ]; then \
uv pip install --no-cache "$WHEEL_FILE" triton==3.5.0; \ uv pip install --no-cache "$WHEEL_FILE" triton==3.5.1; \
else \ else \
echo "No wheel file found in /trtllm_wheel directory."; \ echo "No wheel file found in /trtllm_wheel directory."; \
exit 1; \ exit 1; \
fi; \ fi; \
elif [ -n "$(find /trtllm_wheel_image -name "*.whl" | head -n 1)" ]; then \
# Install from wheel embedded in the TRTLLM release image
WHEEL_FILE="$(find /trtllm_wheel_image -name "*.whl" | head -n 1)"; \
uv pip install --no-cache "$WHEEL_FILE" triton==3.5.1; \
else \ else \
# Download and run install_tensorrt.sh from TensorRT-LLM GitHub before installing the wheel
TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/.*==([0-9a-zA-Z.+-]+).*/\1/') && \
(curl -fsSL --retry 5 --retry-delay 10 --max-time 1800 -o /tmp/install_tensorrt.sh "https://github.com/NVIDIA/TensorRT-LLM/raw/v${TRTLLM_VERSION}/docker/common/install_tensorrt.sh" || \
curl -fsSL --retry 5 --retry-delay 10 --max-time 1800 -o /tmp/install_tensorrt.sh "https://github.com/NVIDIA/TensorRT-LLM/raw/${GITHUB_TRTLLM_COMMIT}/docker/common/install_tensorrt.sh") && \
# Modify the script to use virtual environment pip instead of system pip3
sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
bash /tmp/install_tensorrt.sh && \
# Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
# TRTLLM 1.2.0rc6.post2 has issues installing from pypi with uv, installing from direct wheel link works best # TRTLLM 1.2.0rc6.post2 has issues installing from pypi with uv, installing from direct wheel link works best
# explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x86_64 for some reason # explicitly installing triton 3.5.1 as trtllm only lists triton as dependency on x86_64 for some reason
if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \ if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \ TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \ PYTHON_TAG="cp$(echo ${PYTHON_VERSION} | tr -d '.')"; \
DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \ DIRECT_URL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-${TRTLLM_VERSION}-${PYTHON_TAG}-${PYTHON_TAG}-linux_${ARCH_ALT}.whl"; \
uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.0; \ uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${DIRECT_URL}" triton==3.5.1; \
else \ else \
uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" triton==3.5.0; \ uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" triton==3.5.1; \
fi; \ fi; \
fi fi && \
# Run TensorRT installer that ships with the TRTLLM wheel
TRT_INSTALLER="$(python -c "import glob, os, site; paths = []; \
paths += site.getsitepackages() if hasattr(site, 'getsitepackages') else []; \
user_site = site.getusersitepackages(); \
paths.append(user_site) if user_site else None; \
installer = ''; \
\
[installer:=matches[0] for base in paths \
for matches in [glob.glob(os.path.join(base, 'tensorrt_llm', '**', 'install_tensorrt.sh'), recursive=True)] \
if matches and not installer]; \
print(installer)")"; \
if [ -z "$TRT_INSTALLER" ]; then \
echo "No install_tensorrt.sh found inside tensorrt_llm package."; \
exit 1; \
fi; \
sed -i 's/pip3 install/uv pip install/g' "$TRT_INSTALLER"; \
bash "$TRT_INSTALLER"
################################################## ##################################################
########## Runtime Image ######################## ########## Runtime Image ########################
......
...@@ -74,7 +74,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")") ...@@ -74,7 +74,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
# Base Images # Base Images
TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
TRTLLM_BASE_IMAGE_TAG=25.10-py3 TRTLLM_BASE_IMAGE_TAG=25.12-py3
# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch, # Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
# we need to build the TensorRT-LLM wheel from source. # we need to build the TensorRT-LLM wheel from source.
...@@ -104,7 +104,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" ...@@ -104,7 +104,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided. # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI # Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit. # variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="50379d028c2689ffb5cefe7797c5afb199e9df93" # 1.2.0rc6.post2 DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="45d7022cc33903509fd8045bbc577d77dd1d3e2f" # 1.3.0rc1
TRTLLM_COMMIT="" TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TRTLLM_GIT_URL="" TRTLLM_GIT_URL=""
...@@ -113,8 +113,13 @@ TRTLLM_GIT_URL="" ...@@ -113,8 +113,13 @@ TRTLLM_GIT_URL=""
DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/" DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc6.post2" DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.3.0rc1"
# TensorRT-LLM wheels on PyPI might not be compatible with the NGC PyTorch.
# For incompatible versions, we install the wheel from the NGC image during the Docker build.
# The following versions are not ABI compatible with the NGC PyTorch.
TRTLLM_ABI_INCOMPATIBLE_VERSIONS=("1.3.0rc1")
TENSORRTLLM_PIP_WHEEL="" TENSORRTLLM_PIP_WHEEL=""
TRTLLM_WHEEL_IMAGE=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: OPS-612 NCCL will hang with 25.03, so use 25.01 for now # FIXME: OPS-612 NCCL will hang with 25.03, so use 25.01 for now
...@@ -677,6 +682,50 @@ check_wheel_file() { ...@@ -677,6 +682,50 @@ check_wheel_file() {
return 0 return 0
} }
get_trtllm_version_from_pip_wheel() {
    # Print the version pinned in a pip requirement such as
    # "tensorrt-llm==1.3.0rc1", without any leading "v".
    # Prints an empty line when the spec has no "==" pin or the pinned text
    # is not accepted by _is_semver_ref. Always returns 0.
    local spec="$1"
    local parsed=""

    # Unpinned specs carry no version to extract.
    if [[ "$spec" != *==* ]]; then
        echo ""
        return 0
    fi

    parsed=$(echo "$spec" | sed -n 's/.*==\([0-9a-zA-Z\.\-]*\).*/\1/p')
    if ! _is_semver_ref "$parsed"; then
        echo ""
        return 0
    fi

    echo "${parsed#v}"
    return 0
}
trtllm_version_incompatible() {
    # Succeed (return 0) when the given version string exactly equals one of
    # the entries in TRTLLM_ABI_INCOMPATIBLE_VERSIONS; return 1 otherwise.
    #   $1 - version string to look up
    local version="$1"
    # Keep the loop variable local so it does not leak into the caller's
    # global scope after the function returns.
    local candidate
    for candidate in "${TRTLLM_ABI_INCOMPATIBLE_VERSIONS[@]}"; do
        # Quoted right-hand side: literal comparison, not a glob match.
        if [[ "$version" == "$candidate" ]]; then
            return 0
        fi
    done
    return 1
}
_is_semver_ref() {
    # Return 0 when $1 has the form MAJOR.MINOR.PATCH with an optional
    # leading "v" and an optional suffix: either "-"/"+" followed by
    # identifier characters, or a letter followed by at least one more
    # identifier character (e.g. "rc1"). Return 1 otherwise.
    local candidate="$1"
    local pattern='^v?(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)([-+][0-9A-Za-z.-]+|[A-Za-z][0-9A-Za-z.-]+)?$'
    if [[ "$candidate" =~ $pattern ]]; then
        return 0
    fi
    return 1
}
get_github_trtllm_ref() {
    # Print the git ref to use for $1. Values accepted by _is_semver_ref are
    # printed with exactly one leading "v"; anything else (for example a
    # commit hash or branch name) is printed unchanged. Always returns 0.
    local ref="$1"

    if ! _is_semver_ref "$ref"; then
        echo "$ref"
        return 0
    fi

    case "$ref" in
        v*) echo "$ref" ;;
        *)  echo "v${ref}" ;;
    esac
    return 0
}
function determine_user_intention_trtllm() { function determine_user_intention_trtllm() {
# The tensorrt llm installation flags are not quite mutually exclusive # The tensorrt llm installation flags are not quite mutually exclusive
# since the user should be able to point at a directory of their choosing # since the user should be able to point at a directory of their choosing
...@@ -764,15 +813,22 @@ if [[ $FRAMEWORK == "TRTLLM" ]]; then ...@@ -764,15 +813,22 @@ if [[ $FRAMEWORK == "TRTLLM" ]]; then
if [[ "$TRTLLM_INTENTION" == "download" ]]; then if [[ "$TRTLLM_INTENTION" == "download" ]]; then
TENSORRTLLM_INDEX_URL=${TENSORRTLLM_INDEX_URL:-$DEFAULT_TENSORRTLLM_INDEX_URL} TENSORRTLLM_INDEX_URL=${TENSORRTLLM_INDEX_URL:-$DEFAULT_TENSORRTLLM_INDEX_URL}
TENSORRTLLM_PIP_WHEEL=${TENSORRTLLM_PIP_WHEEL:-$DEFAULT_TENSORRTLLM_PIP_WHEEL} TENSORRTLLM_PIP_WHEEL=${TENSORRTLLM_PIP_WHEEL:-$DEFAULT_TENSORRTLLM_PIP_WHEEL}
TRTLLM_WHEEL_VERSION=$(get_trtllm_version_from_pip_wheel "${TENSORRTLLM_PIP_WHEEL}")
if trtllm_version_incompatible "${TRTLLM_WHEEL_VERSION}"; then
TRTLLM_WHEEL_IMAGE="nvcr.io/nvidia/tensorrt-llm/release:${TRTLLM_WHEEL_VERSION}"
BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=0"
BUILD_ARGS+=" --build-arg TRTLLM_WHEEL_IMAGE=${TRTLLM_WHEEL_IMAGE}"
PRINT_TRTLLM_WHEEL_FILE=${TRTLLM_WHEEL_IMAGE}
else
BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=0" BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=0"
BUILD_ARGS+=" --build-arg TENSORRTLLM_PIP_WHEEL=${TENSORRTLLM_PIP_WHEEL}" BUILD_ARGS+=" --build-arg TENSORRTLLM_PIP_WHEEL=${TENSORRTLLM_PIP_WHEEL}"
BUILD_ARGS+=" --build-arg TENSORRTLLM_INDEX_URL=${TENSORRTLLM_INDEX_URL}" BUILD_ARGS+=" --build-arg TENSORRTLLM_INDEX_URL=${TENSORRTLLM_INDEX_URL}"
PRINT_TRTLLM_WHEEL_FILE=${TENSORRTLLM_PIP_WHEEL}
fi
# Create a dummy directory to satisfy the build context requirement # Create a dummy directory to satisfy the build context requirement
# There is no way to conditionally copy the build context in dockerfile. # There is no way to conditionally copy the build context in dockerfile.
mkdir -p /tmp/dummy_dir mkdir -p /tmp/trtllm_wheel_context
BUILD_CONTEXT_ARG+=" --build-context trtllm_wheel=/tmp/dummy_dir" BUILD_CONTEXT_ARG+=" --build-context trtllm_wheel=/tmp/trtllm_wheel_context"
PRINT_TRTLLM_WHEEL_FILE=${TENSORRTLLM_PIP_WHEEL}
elif [[ "$TRTLLM_INTENTION" == "install" ]]; then elif [[ "$TRTLLM_INTENTION" == "install" ]]; then
echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}" echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}"; then if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}"; then
...@@ -811,7 +867,11 @@ if [[ $FRAMEWORK == "TRTLLM" ]]; then ...@@ -811,7 +867,11 @@ if [[ $FRAMEWORK == "TRTLLM" ]]; then
if [[ -z "$TRTLLM_COMMIT" ]]; then if [[ -z "$TRTLLM_COMMIT" ]]; then
# Attempt to default since the commit will work with a hash or a tag/branch # Attempt to default since the commit will work with a hash or a tag/branch
if [[ ! -z "$TENSORRTLLM_PIP_WHEEL" ]]; then if [[ ! -z "$TENSORRTLLM_PIP_WHEEL" ]]; then
TRTLLM_COMMIT=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -n 's/.*==\([0-9a-zA-Z\.\-]*\).*/\1/p') TRTLLM_COMMIT=$(get_trtllm_version_from_pip_wheel "${TENSORRTLLM_PIP_WHEEL}")
if [[ -z "$TRTLLM_COMMIT" ]]; then
echo -e "[ERROR] Could not parse a semver version from TENSORRTLLM_PIP_WHEEL: ${TENSORRTLLM_PIP_WHEEL}"
exit 1
fi
echo "Attempting to default TRTLLM_COMMIT to \"$TRTLLM_COMMIT\" for installation of TensorRT." echo "Attempting to default TRTLLM_COMMIT to \"$TRTLLM_COMMIT\" for installation of TensorRT."
else else
echo -e "[ERROR] TRTLLM framework was set as a target but the TRTLLM_COMMIT variable was not set." echo -e "[ERROR] TRTLLM framework was set as a target but the TRTLLM_COMMIT variable was not set."
...@@ -820,7 +880,8 @@ if [[ $FRAMEWORK == "TRTLLM" ]]; then ...@@ -820,7 +880,8 @@ if [[ $FRAMEWORK == "TRTLLM" ]]; then
exit 1 exit 1
fi fi
fi fi
BUILD_ARGS+=" --build-arg GITHUB_TRTLLM_COMMIT=${TRTLLM_COMMIT}" GITHUB_TRTLLM_REF=$(get_github_trtllm_ref "${TRTLLM_COMMIT}")
BUILD_ARGS+=" --build-arg GITHUB_TRTLLM_COMMIT=${GITHUB_TRTLLM_REF}"
fi fi
......
...@@ -53,7 +53,7 @@ tensorboard>=2.19.0,<2.21.0 ...@@ -53,7 +53,7 @@ tensorboard>=2.19.0,<2.21.0
tensorboardX==2.6.2.2 tensorboardX==2.6.2.2
# Transformers version constraint for container builds # Transformers version constraint for container builds
# - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
# - TensorRT-LLM 1.2.0rc6.post2: ==4.57.1 # - TensorRT-LLM 1.3.0rc1: ==4.57.1
# - SGLang 0.5.8: ==4.57.1 # - SGLang 0.5.8: ==4.57.1
# Using >=4.56.0 to satisfy all frameworks # Using >=4.56.0 to satisfy all frameworks
transformers>=4.56.0 transformers>=4.56.0
......
...@@ -18,7 +18,7 @@ The following table shows the backend framework versions included with each Dyna ...@@ -18,7 +18,7 @@ The following table shows the backend framework versions included with each Dyna
| :------------- | :------------- | :--------------- | :------------------ | :--------- | :--------- | :--------------- | :--------- | | :------------- | :------------- | :--------------- | :------------------ | :--------- | :--------- | :--------------- | :--------- |
| vLLM | `0.14.1` | `0.12.0` | `0.12.0` | `0.12.0` | `0.11.0` | `0.11.0` | `0.11.0` | | vLLM | `0.14.1` | `0.12.0` | `0.12.0` | `0.12.0` | `0.11.0` | `0.11.0` | `0.11.0` |
| SGLang | `0.5.8` | `0.5.6.post2` | `0.5.6.post2` | `0.5.6.post2` | `0.5.3.post4` | `0.5.3.post4` | `0.5.3.post4` | | SGLang | `0.5.8` | `0.5.6.post2` | `0.5.6.post2` | `0.5.6.post2` | `0.5.3.post4` | `0.5.3.post4` | `0.5.3.post4` |
| TensorRT-LLM | `1.2.0rc6.post2` | `1.2.0rc6.post2` | `1.2.0rc6.post1` | `1.2.0rc6.post1` | `1.2.0rc3` | `1.2.0rc3` | `1.2.0rc2` | | TensorRT-LLM | `1.3.0rc1` | `1.2.0rc6.post2` | `1.2.0rc6.post1` | `1.2.0rc6.post1` | `1.2.0rc3` | `1.2.0rc3` | `1.2.0rc2` |
| NIXL | `0.9.0` | `0.8.0` | `0.8.0` | `0.8.0` | `0.8.0` | `0.8.0` | `0.8.0` | | NIXL | `0.9.0` | `0.8.0` | `0.8.0` | `0.8.0` | `0.8.0` | `0.8.0` | `0.8.0` |
**main (ToT)** reflects the current development branch. **v0.8.1.post1** is a patch release for PyPI wheels and TRT-LLM container only (no GitHub release). **main (ToT)** reflects the current development branch. **v0.8.1.post1** is a patch release for PyPI wheels and TRT-LLM container only (no GitHub release).
......
...@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" ...@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies] [project.optional-dependencies]
trtllm =[ trtllm =[
"uvloop", "uvloop",
"tensorrt-llm==1.2.0rc6.post2", "tensorrt-llm==1.3.0rc1",
] ]
vllm = [ vllm = [
......
...@@ -1835,7 +1835,7 @@ def _test_router_decisions_disagg( ...@@ -1835,7 +1835,7 @@ def _test_router_decisions_disagg(
verify_response_timing(timing_info) verify_response_timing(timing_info)
# Small delay between requests # Small delay between requests
await asyncio.sleep(0.5) await asyncio.sleep(1)
return prefill_worker_ids, decode_worker_ids return prefill_worker_ids, decode_worker_ids
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment