Commit 7dca64df authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

chore: Update TRTLLM version. Fix router. (#527)


Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent 1dd20902
......@@ -13,8 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
ARG BASE_IMAGE="gitlab-master.nvidia.com:5005/dl/dgx/tritonserver/tensorrt-llm/amd64"
ARG BASE_IMAGE_TAG="krish-fix-trtllm-build.23766174"
ARG BASE_IMAGE="tensorrt_llm/release"
ARG BASE_IMAGE_TAG="latest"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
......@@ -23,7 +23,8 @@ USER root
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb && rm nats-server-v2.10.24-amd64.deb
RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && \
dpkg -i nats-server-v2.10.24-amd64.deb && rm nats-server-v2.10.24-amd64.deb
# etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
......
......@@ -17,7 +17,7 @@
# Build the TRT-LLM base image.
# This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
TRTLLM_COMMIT=9b931c0f6
TRTLLM_COMMIT=0d4d50a745
while getopts "c:" opt; do
case ${opt} in
......@@ -29,12 +29,15 @@ done
(cd /tmp && \
# Clone the TensorRT-LLM repository.
if [ ! -d "TensorRT-LLM" ]; then
git clone https://github.com/NVIDIA/TensorRT-LLM.git
git clone --single-branch --branch main https://github.com/NVIDIA/TensorRT-LLM.git
fi
cd TensorRT-LLM
# Checkout the specified commit.
# Switch to the main branch to pull the latest changes.
git checkout main
git pull
git checkout $TRTLLM_COMMIT
# Update the submodules.
......
......@@ -44,6 +44,7 @@ class DynamoTRTLLMCompletionRequest(CompletionRequest):
class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
id: str = Field(default_factory=lambda: f"chatcmpl-{str(uuid.uuid4().hex)}")
max_completion_tokens: Optional[int] = None
max_tokens: Optional[int] = None
disaggregated_params: Optional[DisaggregatedParams] = Field(default=None)
......
......@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import signal
from dataclasses import asdict
......@@ -74,11 +75,13 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine):
@async_on_start
async def async_init(self):
    """Deferred initialization run once the worker has started.

    Initializes the base TensorRT-LLM engine from the stored engine args,
    then schedules creation of the KV-metrics publisher endpoint in the
    background so startup is not blocked on it.
    """
    # NOTE(review): __init__ of the base engine is intentionally deferred to
    # here (not the constructor) — presumably the event loop / dynamo context
    # must exist first. Confirm against BaseTensorrtLLMEngine.
    super().__init__(self.trtllm_engine_args)
    # Fire-and-forget: endpoint creation runs concurrently with serving.
    # The callback only logs completion; exceptions raised inside the task
    # are swallowed by this pattern — worth verifying that is acceptable.
    task = asyncio.create_task(self.create_metrics_publisher_endpoint())
    task.add_done_callback(lambda _: print("metrics publisher endpoint created"))
    print("TensorRT-LLM Worker initialized")
async def create_metrics_publisher_endpoint(self):
    """Register the KV-metrics publisher as an endpoint on this worker's component.

    The publisher lives on the engine-args object
    (``trtllm_engine_args.kv_metrics_publisher``); the earlier
    ``self.metrics_publisher`` call was the pre-change version of this same
    line left behind by the diff and referenced an attribute that no longer
    exists, so it has been removed rather than awaited twice.
    """
    component = dynamo_context["component"]
    await self.trtllm_engine_args.kv_metrics_publisher.create_endpoint(component)
@dynamo_endpoint()
async def generate(self, request: TRTLLMWorkerRequest):
......
......@@ -129,6 +129,16 @@ class Processor(ChatProcessorMixin):
@dynamo_endpoint(name="chat/completions")
async def generate_chat(self, raw_request: DynamoTRTLLMChatCompletionRequest):
    """Stream chat-completion responses for *raw_request*.

    ``max_tokens`` is deprecated in favor of ``max_completion_tokens``.
    When only the deprecated field is supplied, its value is promoted to
    ``max_completion_tokens``; supplying both with different values raises
    ``ValueError``.
    """
    deprecated_limit = raw_request.max_tokens
    if deprecated_limit is not None:
        if raw_request.max_completion_tokens is None:
            # Only the deprecated field was set — adopt it.
            raw_request.max_completion_tokens = deprecated_limit
        elif raw_request.max_completion_tokens != deprecated_limit:
            raise ValueError(
                "max_tokens and max_completion_tokens must be the same"
            )
    async for response in self._generate(raw_request, RequestType.CHAT):
        yield response
......
......@@ -23,6 +23,10 @@ Processor:
block-size: 64
router: kv
Router:
model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
min-workers: 1
TensorRTLLMWorker:
engine_args: "configs/llm_api_config.yaml"
router: kv
......
......@@ -96,8 +96,6 @@ def init_global_engine(args, engine_config):
DynamoTRTLLMChatCompletionRequest, DynamoTRTLLMChatCompletionStreamResponse
)
async def generate(request):
if request.max_completion_tokens is not None:
request.max_tokens = request.max_completion_tokens
async for response in chat_generator(engine, request):
yield response
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment