"lib/runtime/vscode:/vscode.git/clone" did not exist on "113f4d91259ecc01933f6732d8915a7ae86cacb4"
Unverified Commit 7dd79013 authored by Tanmay Verma, committed by GitHub

build: Cleans the TensorRTLLM + Dynamo container build (#968)


Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 412ec843
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-ARG BASE_IMAGE="tensorrt_llm/release"
-ARG BASE_IMAGE_TAG="latest_squashed"
+ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
+ARG BASE_IMAGE_TAG="25.03-py3"
 ARG RELEASE_BUILD

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
@@ -59,10 +59,31 @@ RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$E
     rm /tmp/etcd.tar.gz
 ENV PATH=/usr/local/bin/etcd/:$PATH

-# TODO: Try using uv to install tensorrtllm
-ARG TENSORRTLLM_PIP_WHEEL_PATH=""
-COPY ${TENSORRTLLM_PIP_WHEEL_PATH}/*.whl /tmp/
-RUN find /tmp -name "*.whl" -exec pip install {} +
+ARG HAS_TRTLLM_CONTEXT=0
+ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
+ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+
+COPY --from=trtllm_wheel . /trtllm_wheel/
+
+# TODO: Currently, ABI compatibility issues with TRTLLM wheel and NGC PyTorch prevent us
+# from using the TRTLLM wheel in a uv venv. Once the issues are resolved, we can
+# use uv to install the TensorRT-LLM wheel within the uv venv.
+RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
+    if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
+        # Install from local wheel directory in build context
+        WHEEL_FILE=$(find /trtllm_wheel -name "*.whl" | head -n 1); \
+        if [ -n "$WHEEL_FILE" ]; then \
+            pip install "$WHEEL_FILE"; \
+        else \
+            echo "No wheel file found in /trtllm_wheel directory."; \
+            exit 1; \
+        fi; \
+    else \
+        # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
+        pip install --index-url "${TENSORRTLLM_INDEX_URL}" \
+            --extra-index-url https://pypi.org/simple \
+            "${TENSORRTLLM_PIP_WHEEL}" ; \
+    fi

 # Install genai-perf for benchmarking
 # TODO: Move to published pypi tags
@@ -90,7 +111,8 @@ RUN apt-get update && \
     protobuf-compiler \
     cmake \
     libssl-dev \
-    pkg-config
+    pkg-config \
+    libclang-dev
@@ -237,12 +259,7 @@ ENV DYNAMO_HOME=/workspace
 # Use UCX for TRTLLM KV Cache Transfer
 ENV TRTLLM_USE_UCX_KVCACHE=1
-# Needed to use NVLink for TRTLLM KV Cache Transfer
-# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md
-ENV UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
-ENV UCX_CUDA_COPY_DMABUF=no
-ENV UCX_MEMTYPE_CACHE=no
-ENV UCX_RNDV_PIPELINE_ERROR_HANDLING=y

 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
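The `COPY --from=trtllm_wheel . /trtllm_wheel/` line reads from a BuildKit named build context rather than the main build context, which is how the wheel directory can live outside the repository. A minimal sketch of a matching `docker build` invocation; the Dockerfile path and image tag here are illustrative, not the exact values `build.sh` uses:

```bash
# Sketch under assumed paths: HAS_TRTLLM_CONTEXT=1 selects the local-wheel
# branch of the RUN step above, and the named context "trtllm_wheel" backs
# the COPY --from=trtllm_wheel line. Requires BuildKit (docker buildx).
docker buildx build \
    -f container/Dockerfile.tensorrtllm \
    --build-arg HAS_TRTLLM_CONTEXT=1 \
    --build-context trtllm_wheel=/tmp/trtllm_wheel/ \
    -t dynamo:tensorrtllm .
```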
@@ -57,9 +57,43 @@ DOCKERFILE=${SOURCE_DIR}/Dockerfile
 BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")

 # Base Images
-TENSORRTLLM_BASE_IMAGE=tensorrt_llm/release
-TENSORRTLLM_BASE_IMAGE_TAG=latest_squashed
-TENSORRTLLM_PIP_WHEEL_PATH=""
+TENSORRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
+TENSORRTLLM_BASE_IMAGE_TAG=25.03-py3
+
+# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
+# we need to build the TensorRT-LLM wheel from source.
+#
+# There are two ways to build the dynamo image with TensorRT-LLM:
+# 1. Use the local TensorRT-LLM wheel directory.
+# 2. Use the TensorRT-LLM wheel on artifactory.
+#
+# If using option 1, TENSORRTLLM_PIP_WHEEL_DIR must be a path to a directory
+# containing the TensorRT-LLM wheel file along with a commit.txt file whose
+# contents are <arch>_<commit ID>. If no valid trtllm wheel is found, the script
+# will attempt to build the wheel from source and store the built wheel in the
+# specified directory. TRTLLM_COMMIT from the TensorRT-LLM main branch will be
+# used to build the wheel.
+#
+# If using option 2, TENSORRTLLM_PIP_WHEEL must be the TensorRT-LLM wheel
+# package that will be installed from the specified TensorRT-LLM PyPI index URL.
+# This option ignores TRTLLM_COMMIT. As the TensorRT-LLM wheel from PyPI
+# is not ABI compatible with NGC PyTorch, you can use TENSORRTLLM_INDEX_URL to specify
+# a private PyPI index URL which has your pre-built TensorRT-LLM wheel.
+#
+# By default, we use option 1. If you want to use option 2, set
+# TENSORRTLLM_PIP_WHEEL to the TensorRT-LLM wheel on artifactory.
+#
+# Path to the local TensorRT-LLM wheel directory or the wheel on artifactory.
+TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
+
+# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
+# Important Note: This commit is not used in our CI pipeline. See the CI
+# variables to learn how to run a pipeline with a specific commit.
+TRTLLM_COMMIT=83f37614ef735d251281136c3c05b1fecf8ef68b
+
+# TensorRT-LLM PyPI index URL
+TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+TENSORRTLLM_PIP_WHEEL=""

 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 VLLM_BASE_IMAGE_TAG="25.03-cuda12.8-devel-ubuntu24.04"
@@ -70,6 +104,8 @@ NONE_BASE_IMAGE_TAG="24.04"
 NIXL_COMMIT=d247e88c72db75dc00e4e37aa21ed8d99e60c27d
 NIXL_REPO=ai-dynamo/nixl.git

+NO_CACHE=""
+
 get_options() {
     while :; do
         case $1 in
@@ -93,9 +129,33 @@ get_options() {
                 missing_requirement "$1"
             fi
             ;;
-        --tensorrtllm-pip-wheel-path)
+        --tensorrtllm-pip-wheel-dir)
             if [ "$2" ]; then
-                TENSORRTLLM_PIP_WHEEL_PATH=$2
+                TENSORRTLLM_PIP_WHEEL_DIR=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
+        --tensorrtllm-commit)
+            if [ "$2" ]; then
+                TRTLLM_COMMIT=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
+        --tensorrtllm-pip-wheel)
+            if [ "$2" ]; then
+                TENSORRTLLM_PIP_WHEEL=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
+        --tensorrtllm-index-url)
+            if [ "$2" ]; then
+                TENSORRTLLM_INDEX_URL=$2
                 shift
             else
                 missing_requirement "$1"
@@ -252,7 +312,7 @@ show_image_options() {
     echo "    Base: '${BASE_IMAGE}'"
     echo "    Base_Image_Tag: '${BASE_IMAGE_TAG}'"
     if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
-        echo "    Tensorrtllm_Pip_Wheel_Path: '${TENSORRTLLM_PIP_WHEEL_PATH}'"
+        echo "    Tensorrtllm_Pip_Wheel: '${TENSORRTLLM_PIP_WHEEL}'"
     fi
     echo "    Build Context: '${BUILD_CONTEXT}'"
     echo "    Build Arguments: '${BUILD_ARGS}'"
@@ -266,7 +326,10 @@ show_help() {
     echo "  [--base-image-tag base image tag]"
     echo "  [--platform platform for docker build]"
     echo "  [--framework framework one of ${!FRAMEWORKS[*]}]"
-    echo "  [--tensorrtllm-pip-wheel-path path to tensorrtllm pip wheel]"
+    echo "  [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]"
+    echo "  [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]"
+    echo "  [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]"
+    echo "  [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]"
     echo "  [--build-arg additional build args to pass to docker build]"
     echo "  [--cache-from cache location to start from]"
     echo "  [--cache-to location where to cache the build output]"
@@ -289,7 +352,9 @@ error() {
 get_options "$@"

 # Automatically set ARCH and ARCH_ALT if PLATFORM is linux/arm64
+ARCH="amd64"
 if [[ "$PLATFORM" == *"linux/arm64"* ]]; then
+    ARCH="arm64"
     BUILD_ARGS+=" --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64 "
 fi
@@ -349,9 +414,73 @@ if [ -n "${GITLAB_TOKEN}" ]; then
     BUILD_ARGS+=" --build-arg GITLAB_TOKEN=${GITLAB_TOKEN} "
 fi
+
+check_wheel_file() {
+    local wheel_dir="$1"
+    # Check if directory exists
+    if [ ! -d "$wheel_dir" ]; then
+        echo "Error: Directory '$wheel_dir' does not exist"
+        return 1
+    fi
+
+    # Look for .whl files
+    wheel_count=$(find "$wheel_dir" -name "*.whl" | wc -l)
+    if [ "$wheel_count" -eq 0 ]; then
+        echo "WARN: No .whl files found in '$wheel_dir'"
+        return 1
+    elif [ "$wheel_count" -gt 1 ]; then
+        echo "Warning: Multiple wheel files found in '$wheel_dir'. Will use the first one found."
+        find "$wheel_dir" -name "*.whl" | head -n 1
+        return 0
+    else
+        echo "Found $wheel_count wheel file in '$wheel_dir'"
+        # Check if commit file exists
+        commit_file="$wheel_dir/commit.txt"
+        if [ ! -f "$commit_file" ]; then
+            echo "Error: Commit file '$commit_file' does not exist"
+            return 1
+        fi
+        # Check if the commit ID matches, otherwise re-build the wheel.
+        # Commit ID is of the form <arch>_<commit_id>.
+        commit_id=$(cat "$commit_file")
+        if [ "$commit_id" != "$2" ]; then
+            echo "Error: Commit ID mismatch. Expected '$2', got '$commit_id'"
+            rm -rf "$wheel_dir"/*.whl
+            return 1
+        fi
+        return 0
+    fi
+}
+
 if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
-    if [ -n "${TENSORRTLLM_PIP_WHEEL_PATH}" ]; then
-        BUILD_ARGS+=" --build-arg TENSORRTLLM_PIP_WHEEL_PATH=${TENSORRTLLM_PIP_WHEEL_PATH} "
+    if [ -z "${TENSORRTLLM_PIP_WHEEL}" ]; then
+        # Use option 1
+        if [ ! -d "${TENSORRTLLM_PIP_WHEEL_DIR}" ]; then
+            # Create the directory if it doesn't exist
+            mkdir -p ${TENSORRTLLM_PIP_WHEEL_DIR}
+        fi
+        BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=1"
+        echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
+        if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}" "${ARCH}_${TRTLLM_COMMIT}"; then
+            echo "WARN: Valid trtllm wheel file not found in ${TENSORRTLLM_PIP_WHEEL_DIR}, attempting to build from source"
+            if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH}; then
+                error "ERROR: Failed to build TensorRT-LLM wheel"
+            fi
+        fi
+        echo "Installing TensorRT-LLM from local wheel directory"
+        BUILD_CONTEXT_ARG+=" --build-context trtllm_wheel=${TENSORRTLLM_PIP_WHEEL_DIR}"
+    else
+        BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=0"
+        BUILD_ARGS+=" --build-arg TENSORRTLLM_PIP_WHEEL=${TENSORRTLLM_PIP_WHEEL}"
+        BUILD_ARGS+=" --build-arg TENSORRTLLM_INDEX_URL=${TENSORRTLLM_INDEX_URL}"
+        # Create a dummy directory to satisfy the build context requirement;
+        # there is no way to conditionally copy a build context in a Dockerfile.
+        mkdir -p /tmp/dummy_dir
+        BUILD_CONTEXT_ARG+=" --build-context trtllm_wheel=/tmp/dummy_dir"
     fi
 fi
@@ -374,19 +503,6 @@ if [ -z "$RUN_PREFIX" ]; then
     set -x
 fi
-
-# Check if the TensorRT-LLM base image exists
-if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
-    if docker inspect --type=image "$BASE_IMAGE:$BASE_IMAGE_TAG" > /dev/null 2>&1; then
-        echo "Image '$BASE_IMAGE:$BASE_IMAGE_TAG' is found."
-    else
-        echo "Image '$BASE_IMAGE:$BASE_IMAGE_TAG' is not found." >&2
-        echo "Please build the TensorRT-LLM base image first. Run ./build_trtllm_base_image.sh" >&2
-        echo "or use --base-image and --base-image-tag to an existing TensorRT-LLM base image." >&2
-        echo "See https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html for more information." >&2
-        exit 1
-    fi
-fi

 $RUN_PREFIX docker build -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE

 { set +x; } 2>/dev/null
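Taken together, the new flags map onto the two install paths described in the comment block above. A pair of example invocations; the flag values are placeholders, and the private index URL is hypothetical:

```bash
# Option 1 (default): build or reuse a local wheel keyed by commit.txt.
./container/build.sh --framework tensorrtllm \
    --tensorrtllm-pip-wheel-dir /tmp/trtllm_wheel/ \
    --tensorrtllm-commit 83f37614ef735d251281136c3c05b1fecf8ef68b

# Option 2: skip the source build and install a pre-built wheel from an index.
./container/build.sh --framework tensorrtllm \
    --tensorrtllm-pip-wheel tensorrt-llm \
    --tensorrtllm-index-url https://pypi.example.com/simple
```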
@@ -14,22 +14,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Build the TRT-LLM base image.
+# Build the TRT-LLM wheel.
 # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
-TRTLLM_COMMIT=dfbcb543

-while getopts "c:" opt; do
+while getopts "c:o:a:" opt; do
     case ${opt} in
         c) TRTLLM_COMMIT=$OPTARG ;;
-        *) echo "Invalid option" ;;
+        o) OUTPUT_DIR=$OPTARG ;;
+        a) ARCH=$OPTARG ;;
+        *) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch]"
+           echo "  -c: TensorRT-LLM commit to build"
+           echo "  -o: Output directory for wheel files"
+           echo "  -a: Architecture (amd64 or arm64)"
+           exit 1 ;;
     esac
 done

-python3 -m venv /tmp/squash-env
-source /tmp/squash-env/bin/activate
-pip3 install docker-squash
+# Set default output directory if not specified
+if [ -z "$OUTPUT_DIR" ]; then
+    OUTPUT_DIR="/tmp/trtllm_wheel"
+fi

 (cd /tmp && \
 # Clone the TensorRT-LLM repository.
@@ -50,9 +56,20 @@ git submodule update --init --recursive
 git lfs pull

 # Build the TRT-LLM base image.
-make -C docker release_build)
+make -C docker wheel_build
+
+# Copy the wheel to the host
+mkdir -p $OUTPUT_DIR
+docker create --name trtllm_wheel_container docker.io/tensorrt_llm/wheel:latest
+docker cp trtllm_wheel_container:/src/tensorrt_llm/build $OUTPUT_DIR/
+cp $OUTPUT_DIR/build/*.whl $OUTPUT_DIR/
+docker rm trtllm_wheel_container || true
+)

-pip3 install docker-squash
-docker-squash -t tensorrt_llm/release:latest_squashed tensorrt_llm/release:latest
-deactivate
\ No newline at end of file
+# Store the commit hash in the output directory to ensure the wheel is built from the correct commit.
+rm -rf $OUTPUT_DIR/commit.txt
+echo ${ARCH}_${TRTLLM_COMMIT} > $OUTPUT_DIR/commit.txt
+
+echo "TRT-LLM wheel built successfully."
+ls -al $OUTPUT_DIR
\ No newline at end of file
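As invoked from `build.sh`, the wheel builder now takes the commit, output directory, and architecture via the new getopts flags. A standalone example run, assuming the script lives under `container/` as the rest of the diff suggests:

```bash
# Builds the TRT-LLM wheel at the given commit, then leaves the .whl and a
# commit.txt marker ("<arch>_<commit>") in the output directory for reuse.
./container/build_trtllm_wheel.sh \
    -c 83f37614ef735d251281136c3c05b1fecf8ef68b \
    -o /tmp/trtllm_wheel/ \
    -a amd64
```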
@@ -42,30 +42,10 @@ docker compose -f deploy/metrics/docker-compose.yml up -d
 ### Build docker

-#### Step 1: Build TensorRT-LLM base container image
-
-Because of the known issue of C++11 ABI compatibility within the NGC pytorch container, we rebuild TensorRT-LLM from source.
-See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html) for more information.
-
-Use the helper script to build a TensorRT-LLM container base image. The script uses a specific commit id from the TensorRT-LLM main branch.
-
 ```bash
 # TensorRT-LLM uses git-lfs, which needs to be installed in advance.
 apt-get update && apt-get -y install git git-lfs
-
-# The script uses python packages like docker-squash to squash image
-# layers within the trtllm base image
-DEBIAN_FRONTEND=noninteractive TZ=America/Los_Angeles apt-get -y install python3 python3-pip python3-venv
-./container/build_trtllm_base_image.sh
-```
-
-See [here](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html#option-1-build-tensorrt-llm-in-one-step) for more details on building from source.
-
-If you already have a TensorRT-LLM container image, you can skip this step.
-
-#### Step 2: Build the Dynamo container
-
-```
 # On an x86 machine:
 ./container/build.sh --framework tensorrtllm
@@ -73,14 +53,15 @@ If you already have a TensorRT-LLM container image, you can skip this step.
 ./container/build.sh --framework tensorrtllm --platform linux/arm64
 ```

-This build script internally points to the base container image built with step 1. If you skipped the previous step because you already have the container image available, you can run the build script with that image as a base.
-
-```bash
-# Build dynamo image with other TRTLLM base image.
-./container/build.sh --framework TENSORRTLLM --base-image <trtllm-base-image> --base-image-tag <trtllm-base-image-tag>
-```
+> [!NOTE]
+> Because of a known issue of C++11 ABI compatibility within the NGC pytorch container,
+> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
+> for more information.
+>
+> Hence, when running this script for the first time, the build can take quite a long time.

 ### Run container

 ```
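The note about long first-time builds follows from the wheel cache: once `build_trtllm_wheel.sh` has populated the wheel directory, later builds reuse it as long as the commit marker matches. A quick sanity check, with example paths and sample output:

```bash
# Example only: default wheel directory and an illustrative marker value.
ls /tmp/trtllm_wheel/
# build/  commit.txt  tensorrt_llm-<version>.whl
cat /tmp/trtllm_wheel/commit.txt
# amd64_83f37614ef735d251281136c3c05b1fecf8ef68b
```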
@@ -38,6 +38,7 @@ from tensorrt_llm.serve.openai_protocol import (
 class DynamoTRTLLMCompletionRequest(CompletionRequest):
     id: str = Field(default_factory=lambda: f"cmpl-{str(uuid.uuid4().hex)}")
     max_completion_tokens: Optional[int] = None
+    nvext: Optional[dict] = Field(default=None)

 class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
@@ -45,6 +46,7 @@ class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
     max_completion_tokens: Optional[int] = None
     max_tokens: Optional[int] = None
     disaggregated_params: Optional[DisaggregatedParams] = Field(default=None)
+    nvext: Optional[dict] = Field(default=None)

 class Tokens(BaseModel):
@@ -74,6 +76,7 @@ class TRTLLMWorkerResponseOutput:
     text: str
     token_ids: list[int]
     logprobs: Optional[List[float]] = None
+    prompt_logprobs: Optional[List[float]] = None
     cumulative_logprob: Optional[float] = None
     finish_reason: Optional[Literal["stop", "length", "timeout", "cancelled"]] = None
     stop_reason: Optional[Union[int, str]] = None
@@ -101,8 +104,6 @@ class TRTLLMWorkerResponse(BaseModel):
     prompt_token_ids: list[int]
     outputs: list[dict]
     finished: bool
-    # TODO
-    # prompt_logprobs: list[float]

 class DisaggregatedTypeConverter:
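The new `nvext` fields let the OpenAI-style request models accept NVIDIA's extension payload instead of rejecting it during schema validation. A hypothetical request against a locally served model; the endpoint, port, model name, and the `ignore_eos` key are placeholders, not confirmed by this diff:

```bash
curl -s http://localhost:8000/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "example-model",
          "messages": [{"role": "user", "content": "Hello"}],
          "nvext": {"ignore_eos": false}
        }'
```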
@@ -31,6 +31,8 @@ context_servers:
     enable_chunked_prefill: false
     kv_cache_config:
       free_gpu_memory_fraction: 0.40
+    cache_transceiver_config:
+      max_num_tokens: 10240
     pytorch_backend_config:
       enable_overlap_scheduler: false
       use_cuda_graph: false
@@ -44,6 +46,8 @@ generation_servers:
     max_batch_size: 256
     kv_cache_config:
       free_gpu_memory_fraction: 0.40
+    cache_transceiver_config:
+      max_num_tokens: 256
     pytorch_backend_config:
       enable_overlap_scheduler: true
       use_cuda_graph: false

@@ -33,6 +33,8 @@ context_servers:
       free_gpu_memory_fraction: 0.40
       event_buffer_max_size: 1024
       enable_block_reuse: true
+    cache_transceiver_config:
+      max_num_tokens: 10240
     pytorch_backend_config:
       enable_overlap_scheduler: false
       use_cuda_graph: false
@@ -49,6 +51,8 @@ generation_servers:
       free_gpu_memory_fraction: 0.40
       event_buffer_max_size: 1024
       enable_block_reuse: true
+    cache_transceiver_config:
+      max_num_tokens: 256
     pytorch_backend_config:
       enable_overlap_scheduler: true
       use_cuda_graph: false