Unverified Commit 47d05d7a authored by richardhuo-nv's avatar richardhuo-nv Committed by GitHub
Browse files

build: DIS-148 use the tensorrt_llm public wheel from pypi by default in container build (#1525)

parent ce48a863
...@@ -88,14 +88,15 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" ...@@ -88,14 +88,15 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided. # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI # Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit. # variables to learn how to run a pipeline with a specific commit.
TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3" DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
TRTLLM_COMMIT=""
# TensorRT-LLM PyPI index URL # TensorRT-LLM PyPI index URL
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==0.21.0rc0"
TENSORRTLLM_PIP_WHEEL="" TENSORRTLLM_PIP_WHEEL=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now # FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Please check https://github.com/ai-dynamo/dynamo/pull/1065 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
...@@ -158,6 +159,13 @@ get_options() { ...@@ -158,6 +159,13 @@ get_options() {
missing_requirement "$1" missing_requirement "$1"
fi fi
;; ;;
--use-default-experimental-tensorrtllm-commit)
if [ -n "$2" ] && [[ "$2" != --* ]]; then
echo "ERROR: --use-default-experimental-tensorrtllm-commit does not take any argument"
exit 1
fi
USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
;;
--tensorrtllm-pip-wheel) --tensorrtllm-pip-wheel)
if [ "$2" ]; then if [ "$2" ]; then
TENSORRTLLM_PIP_WHEEL=$2 TENSORRTLLM_PIP_WHEEL=$2
...@@ -344,6 +352,7 @@ show_help() { ...@@ -344,6 +352,7 @@ show_help() {
echo " [--framework framework one of ${!FRAMEWORKS[*]}]" echo " [--framework framework one of ${!FRAMEWORKS[*]}]"
echo " [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]" echo " [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]"
echo " [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]" echo " [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]"
echo " [--use-default-experimental-tensorrtllm-commit] Use the default experimental commit (${DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT}) to build TensorRT-LLM. This is a flag (no argument). Do not combine with --tensorrtllm-commit or --tensorrtllm-pip-wheel."
echo " [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]" echo " [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]"
echo " [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]" echo " [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]"
echo " [--build-arg additional build args to pass to docker build]" echo " [--build-arg additional build args to pass to docker build]"
...@@ -475,6 +484,19 @@ check_wheel_file() { ...@@ -475,6 +484,19 @@ check_wheel_file() {
} }
if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
# Decide which TensorRT-LLM source the build uses: an explicit commit, an
# explicit pip wheel, the default experimental commit, or (when nothing was
# requested) the default pip wheel.
# Reads:  USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT, DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT,
#         DEFAULT_TENSORRTLLM_PIP_WHEEL
# Writes: TRTLLM_COMMIT, TENSORRTLLM_PIP_WHEEL
# Exits 1 when the experimental-commit flag is combined with an explicit
# commit or wheel.
# The ':-' default keeps the test safe if the flag variable was never assigned.
if [ "${USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT:-}" = true ]; then
    # The flag selects the commit itself, so an explicit commit or wheel conflicts with it.
    if [ -n "$TRTLLM_COMMIT" ] || [ -n "$TENSORRTLLM_PIP_WHEEL" ]; then
        # The option name here must match the flag spelled in get_options/show_help.
        echo "ERROR: When using --use-default-experimental-tensorrtllm-commit, do not set --tensorrtllm-commit or --tensorrtllm-pip-wheel."
        exit 1
    fi
    TRTLLM_COMMIT="$DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT"
fi

# If the user set neither a wheel nor a commit, fall back to the default tensorrt_llm pip wheel.
if [ -z "$TENSORRTLLM_PIP_WHEEL" ] && [ -z "$TRTLLM_COMMIT" ]; then
    TENSORRTLLM_PIP_WHEEL="$DEFAULT_TENSORRTLLM_PIP_WHEEL"
fi
if [ -z "${TENSORRTLLM_PIP_WHEEL}" ]; then if [ -z "${TENSORRTLLM_PIP_WHEEL}" ]; then
# Use option 1 # Use option 1
if [ ! -d "${TENSORRTLLM_PIP_WHEEL_DIR}" ]; then if [ ! -d "${TENSORRTLLM_PIP_WHEEL_DIR}" ]; then
......
...@@ -62,6 +62,11 @@ apt-get update && apt-get -y install git git-lfs ...@@ -62,6 +62,11 @@ apt-get update && apt-get -y install git git-lfs
# On an ARM machine: # On an ARM machine:
./container/build.sh --framework tensorrtllm --platform linux/arm64 ./container/build.sh --framework tensorrtllm --platform linux/arm64
# Build the container with the default experimental TensorRT-LLM commit
# WARNING: This is for experimental feature testing only.
# The container should not be used in a production environment.
./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
``` ```
> [!NOTE] > [!NOTE]
...@@ -136,6 +141,10 @@ dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_agg.yaml ...@@ -136,6 +141,10 @@ dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_agg.yaml
``` ```
Notes: Notes:
- MTP is only available in a container built with the experimental TensorRT-LLM commit. To build one, pass `--use-default-experimental-tensorrtllm-commit` to the `build.sh` script.
Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit`
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. - There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. - MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
...@@ -275,6 +284,9 @@ dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/deeps ...@@ -275,6 +284,9 @@ dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/deeps
``` ```
Notes: Notes:
- MTP is only available in a container built with the experimental TensorRT-LLM commit. To build one, pass `--use-default-experimental-tensorrtllm-commit` to the `build.sh` script.
Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit`
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. - There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. - MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment