echo"ERROR: --use-default-experimental-tensorrtllm-commit does not take any argument"
exit 1
fi
USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
;;
--tensorrtllm-pip-wheel)
--tensorrtllm-pip-wheel)
if["$2"];then
if["$2"];then
TENSORRTLLM_PIP_WHEEL=$2
TENSORRTLLM_PIP_WHEEL=$2
...
@@ -344,6 +352,7 @@ show_help() {
...
@@ -344,6 +352,7 @@ show_help() {
echo" [--framework framework one of ${!FRAMEWORKS[*]}]"
echo" [--framework framework one of ${!FRAMEWORKS[*]}]"
echo" [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]"
echo" [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]"
echo" [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]"
echo" [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]"
echo" [--use-default-experimental-tensorrtllm-commit] Use the default experimental commit (${DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT}) to build TensorRT-LLM. This is a flag (no argument). Do not combine with --tensorrtllm-commit or --tensorrtllm-pip-wheel."
echo" [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]"
echo" [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]"
echo" [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]"
echo" [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]"
echo" [--build-arg additional build args to pass to docker build]"
echo" [--build-arg additional build args to pass to docker build]"
@@ -136,6 +141,10 @@ dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_agg.yaml
...
@@ -136,6 +141,10 @@ dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_agg.yaml
```
```
Notes:
Notes:
- MTP is only available in a container built with the experimental TensorRT-LLM commit. To build one, pass `--use-default-experimental-tensorrtllm-commit` to the `build.sh` script.
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
...
@@ -275,6 +284,9 @@ dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/deeps
...
@@ -275,6 +284,9 @@ dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/deeps
```
```
Notes:
Notes:
- MTP is only available in a container built with the experimental TensorRT-LLM commit. To build one, pass `--use-default-experimental-tensorrtllm-commit` to the `build.sh` script.
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.