"tools/cfgs/git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "d2d32f692cd0d5cc628c2d0399dc29dc70039417"
Commit 7dca64df authored by Tanmay Verma, committed by GitHub
Browse files

chore: Update TRTLLM version. Fix router. (#527)


Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent 1dd20902
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
ARG BASE_IMAGE="gitlab-master.nvidia.com:5005/dl/dgx/tritonserver/tensorrt-llm/amd64" ARG BASE_IMAGE="tensorrt_llm/release"
ARG BASE_IMAGE_TAG="krish-fix-trtllm-build.23766174" ARG BASE_IMAGE_TAG="latest"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
...@@ -23,7 +23,8 @@ USER root ...@@ -23,7 +23,8 @@ USER root
# Install utilities # Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats # nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb && rm nats-server-v2.10.24-amd64.deb RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && \
dpkg -i nats-server-v2.10.24-amd64.deb && rm nats-server-v2.10.24-amd64.deb
# etcd # etcd
ENV ETCD_VERSION="v3.5.18" ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \ RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
# Build the TRT-LLM base image. # Build the TRT-LLM base image.
# This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM. # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
TRTLLM_COMMIT=9b931c0f6 TRTLLM_COMMIT=0d4d50a745
while getopts "c:" opt; do while getopts "c:" opt; do
case ${opt} in case ${opt} in
...@@ -29,12 +29,15 @@ done ...@@ -29,12 +29,15 @@ done
(cd /tmp && \ (cd /tmp && \
# Clone the TensorRT-LLM repository. # Clone the TensorRT-LLM repository.
if [ ! -d "TensorRT-LLM" ]; then if [ ! -d "TensorRT-LLM" ]; then
git clone https://github.com/NVIDIA/TensorRT-LLM.git git clone --single-branch --branch main https://github.com/NVIDIA/TensorRT-LLM.git
fi fi
cd TensorRT-LLM cd TensorRT-LLM
# Checkout the specified commit. # Checkout the specified commit.
# Switch to the main branch to pull the latest changes.
git checkout main
git pull
git checkout $TRTLLM_COMMIT git checkout $TRTLLM_COMMIT
# Update the submodules. # Update the submodules.
......
...@@ -44,6 +44,7 @@ class DynamoTRTLLMCompletionRequest(CompletionRequest): ...@@ -44,6 +44,7 @@ class DynamoTRTLLMCompletionRequest(CompletionRequest):
class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest): class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
id: str = Field(default_factory=lambda: f"chatcmpl-{str(uuid.uuid4().hex)}") id: str = Field(default_factory=lambda: f"chatcmpl-{str(uuid.uuid4().hex)}")
max_completion_tokens: Optional[int] = None max_completion_tokens: Optional[int] = None
max_tokens: Optional[int] = None
disaggregated_params: Optional[DisaggregatedParams] = Field(default=None) disaggregated_params: Optional[DisaggregatedParams] = Field(default=None)
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import asyncio
import signal import signal
from dataclasses import asdict from dataclasses import asdict
...@@ -74,11 +75,13 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine): ...@@ -74,11 +75,13 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine):
@async_on_start @async_on_start
async def async_init(self): async def async_init(self):
super().__init__(self.trtllm_engine_args) super().__init__(self.trtllm_engine_args)
task = asyncio.create_task(self.create_metrics_publisher_endpoint())
task.add_done_callback(lambda _: print("metrics publisher endpoint created"))
print("TensorRT-LLM Worker initialized") print("TensorRT-LLM Worker initialized")
async def create_metrics_publisher_endpoint(self): async def create_metrics_publisher_endpoint(self):
component = dynamo_context["component"] component = dynamo_context["component"]
await self.metrics_publisher.create_endpoint(component) await self.trtllm_engine_args.kv_metrics_publisher.create_endpoint(component)
@dynamo_endpoint() @dynamo_endpoint()
async def generate(self, request: TRTLLMWorkerRequest): async def generate(self, request: TRTLLMWorkerRequest):
......
...@@ -129,6 +129,16 @@ class Processor(ChatProcessorMixin): ...@@ -129,6 +129,16 @@ class Processor(ChatProcessorMixin):
@dynamo_endpoint(name="chat/completions") @dynamo_endpoint(name="chat/completions")
async def generate_chat(self, raw_request: DynamoTRTLLMChatCompletionRequest): async def generate_chat(self, raw_request: DynamoTRTLLMChatCompletionRequest):
# max_tokens is deprecated, however if the max_tokens is provided instead
# of max_completion_tokens, we will use the value as max_completion_tokens.
if raw_request.max_tokens is not None:
if raw_request.max_completion_tokens is None:
raw_request.max_completion_tokens = raw_request.max_tokens
else:
if raw_request.max_tokens != raw_request.max_completion_tokens:
raise ValueError(
"max_tokens and max_completion_tokens must be the same"
)
async for response in self._generate(raw_request, RequestType.CHAT): async for response in self._generate(raw_request, RequestType.CHAT):
yield response yield response
......
...@@ -23,6 +23,10 @@ Processor: ...@@ -23,6 +23,10 @@ Processor:
block-size: 64 block-size: 64
router: kv router: kv
Router:
model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
min-workers: 1
TensorRTLLMWorker: TensorRTLLMWorker:
engine_args: "configs/llm_api_config.yaml" engine_args: "configs/llm_api_config.yaml"
router: kv router: kv
......
...@@ -96,8 +96,6 @@ def init_global_engine(args, engine_config): ...@@ -96,8 +96,6 @@ def init_global_engine(args, engine_config):
DynamoTRTLLMChatCompletionRequest, DynamoTRTLLMChatCompletionStreamResponse DynamoTRTLLMChatCompletionRequest, DynamoTRTLLMChatCompletionStreamResponse
) )
async def generate(request): async def generate(request):
if request.max_completion_tokens is not None:
request.max_tokens = request.max_completion_tokens
async for response in chat_generator(engine, request): async for response in chat_generator(engine, request):
yield response yield response
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment