Commit deb6c7e8 authored by ishandhanani, committed by GitHub

feat(llm): adding initial TRTLLM disaggregation support


Co-authored-by: nnshah1 <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 425be8ad
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, T5Tokenizer
class TritonPythonModel:
"""
    This model allows Triton to act as an API server for T3 ICP
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = [
{"name": "query", "data_type": "TYPE_STRING", "dims": [1]},
]
outputs = [
{"name": "start_ids", "data_type": "TYPE_INT32", "dims": [-1]},
{"name": "start_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
]
# Store the model configuration as a dictionary.
config = auto_complete_model_config.as_dict()
        input_names = [model_input["name"] for model_input in config["input"]]
        output_names = [model_output["name"] for model_output in config["output"]]
        # Add only missing inputs and outputs to the model configuration.
        for model_input in inputs:
            if model_input["name"] not in input_names:
                auto_complete_model_config.add_input(model_input)
        for model_output in outputs:
            if model_output["name"] not in output_names:
                auto_complete_model_config.add_output(model_output)
return auto_complete_model_config
def initialize(self, args):
model_config = json.loads(args["model_config"])
self.logger = pb_utils.Logger
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
self._tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
if isinstance(self._tokenizer, T5Tokenizer):
self._tokenizer_bos_id = self._tokenizer.sp_model.bos_id()
if not self._tokenizer.pad_token:
self._tokenizer.pad_token = self._tokenizer.eos_token
self._tokenizer_end_id = self._tokenizer.encode(
self._tokenizer.eos_token, add_special_tokens=False
)[0]
self._tokenizer_pad_id = self._tokenizer.encode(
self._tokenizer.pad_token, add_special_tokens=False
)[0]
self._vocab_size = self._tokenizer.vocab_size
for output_name in ["start_ids", "start_lengths"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
responses = []
for request in requests:
query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
# Preprocessing input data.
if isinstance(self._tokenizer, T5Tokenizer):
start_ids = [
numpy.array(
[self._tokenizer_bos_id]
+ self._tokenizer.encode(
s[0].decode(), add_special_tokens=False
)
).astype(numpy.int32)
for s in query
]
else:
start_ids = [
numpy.array(
self._tokenizer.encode(s[0].decode(), add_special_tokens=False)
).astype(numpy.int32)
for s in query
]
start_lengths = numpy.array([[len(ids)] for ids in start_ids]).astype(
numpy.int32
)
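            # Right-pad every tokenized sequence with the pad token id so the
            # batch stacks into one rectangular int32 array.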
            max_len = max((seq.shape[0] for seq in start_ids), default=0)
start_ids = numpy.stack(
[
numpy.pad(
seq,
(0, max_len - seq.shape[0]),
"constant",
constant_values=(0, self._tokenizer_pad_id),
)
for seq in start_ids
]
)
start_ids_tensor = pb_utils.Tensor(
"start_ids", numpy.array(start_ids).astype(self.start_ids_dtype)
)
start_lengths_tensor = pb_utils.Tensor(
"start_lengths",
numpy.array(start_lengths).astype(self.start_lengths_dtype),
)
outputs = [start_ids_tensor, start_lengths_tensor]
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
responses.append(inference_response)
return responses
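As a usage sketch, the preprocessing model above can be exercised with a plain Triton client. The endpoint and model name below are assumptions for illustration, not part of this commit:

```python
# Minimal client sketch, assuming tritonclient[http] is installed and the model
# is served locally under the hypothetical name "preprocessing".
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# "query" is a TYPE_STRING input with dims [1]; with batching it becomes [batch, 1].
query = np.array([["What is the capital of France?"]], dtype=object)
inp = httpclient.InferInput("query", [1, 1], "BYTES")
inp.set_data_from_numpy(query)

result = client.infer("preprocessing", inputs=[inp])
print(result.as_numpy("start_ids"))      # token ids, right-padded per batch
print(result.as_numpy("start_lengths"))  # unpadded length of each sequence
```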
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 1
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
instance_group [
{
count: 10
    kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
def get_gpu_product_name():
env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = "0"
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu",
"name",
"--format",
"csv",
],
capture_output=True,
text=True,
env=env,
)
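        # stdout is CSV with a header row ("name") followed by one row per GPU;
        # a driver with no attached devices prints "No devices were found".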
result_values = [
x.replace(", ", ",").split(",") for x in result.stdout.split("\n") if x
]
if result_values[0][0] == "No devices were found":
return None
return result_values[1][0].strip().replace(" ", "_")
except FileNotFoundError:
return None
def number_of_gpus():
try:
result = subprocess.run(
["nvidia-smi", "--list-gpus"], capture_output=True, text=True
)
return len(result.stdout.strip().split("\n"))
except FileNotFoundError:
return 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
KNOWN_MODELS = {
"mock": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_patterns": ["*.json"],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
(
"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
"context",
),
(
"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
"generate",
),
(
"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
"tensorrt_llm",
),
],
"template_arguments": {
"tokenizer_dir": "{args.hf_download}",
"triton_max_batch_size": "{args.max_batch_size}",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"context_token_latency_ms": "0.1",
"generate_token_latency_ms": "0.5",
},
},
"llama-3.1-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"download_model_name": "llama-3.1-70b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--max_seq_len",
"131072",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"disable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"enable_chunked_context": "{args.enable_chunked_context}",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3.1-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_model_name": "llama-3.1-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 256,
"templates": [
("tensorrt_llm", "generate"),
"postprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
"preprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "False",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
],
"max_num_tokens": 16384,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-default": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"download_model_name": "llama-3-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
],
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"preprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 128,
"templates": [
"postprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/generate",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"max_batch_size": 512,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 16384,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
}
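The `{args.*}` placeholders above are ordinary `str.format` attribute references that `_prepare` (in the script below) expands against the parsed argument namespace before substituting them into the `config.pbtxt` templates. A minimal sketch of that expansion, with hypothetical values:

```python
# Minimal sketch of the placeholder expansion performed by _prepare below;
# the namespace values here are hypothetical.
from types import SimpleNamespace

args = SimpleNamespace(
    max_batch_size="64",
    hf_download="/models/hf/llama-3-8b-instruct",  # hypothetical path
)
template_arguments = {
    "triton_max_batch_size": "{args.max_batch_size}",
    "tokenizer_dir": "{args.hf_download}",
}
expanded = {key: value.format(args=args) for key, value in template_arguments.items()}
print(expanded["triton_max_batch_size"])  # -> 64
```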
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
import subprocess
from string import Template
from gpu_info import get_gpu_product_name
from huggingface_hub import snapshot_download
from known_models import KNOWN_MODELS
TARGET_DIR = "/workspace/examples/llm/tensorrtllm/operators"
TENSORRTLLM_EXAMPLE_DIR = "/tensorrtllm_backend/tensorrt_llm/examples"
TENSORRTLLM_BACKEND_DIR = "/tensorrtllm_backend"
def _prepare(args):
templates = KNOWN_MODELS[args.model]["templates"]
template_arguments = KNOWN_MODELS[args.model]["template_arguments"]
model_name = (
KNOWN_MODELS[args.model]["model_repo_name"]
if "model_repo_name" in KNOWN_MODELS[args.model]
else None
)
_existing_dir(
args,
"tensorrtllm_model",
args.force_model_repo,
"model repo",
suffix=[args.hw_name, f"TP_{args.tp_size}"],
model_name=model_name,
)
for argument, value in template_arguments.items():
template_arguments[argument] = value.format(args=args)
template_arguments["request_stats_max_iterations"] = 1000
print(template_arguments)
for template in templates:
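        # A template may be a (source_template, target_model_name) tuple, e.g.
        # ("tensorrt_llm", "context"): the source config is written out under the
        # target name, so one template can back several models.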
if isinstance(template, tuple):
template_basename = template[1]
template = template[0]
else:
template_basename = os.path.basename(template)
template_path = os.path.join(
TENSORRTLLM_BACKEND_DIR,
"all_models",
"inflight_batcher_llm",
template,
"config.pbtxt",
)
if template == "ensemble":
target_path = os.path.join(
args.tensorrtllm_model, args.model, "config.pbtxt"
)
else:
target_path = os.path.join(
args.tensorrtllm_model, template_basename, "config.pbtxt"
)
if not args.force_model_repo and os.path.exists(target_path):
continue
print(template_path, os.path.exists(template_path), target_path)
with open(template_path) as f:
pbtxt_template = Template(f.read())
pbtxt = pbtxt_template.safe_substitute(template_arguments)
pbtxt = pbtxt.replace(f'name: "{os.path.basename(template)}"', "")
if not args.dry_run:
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "w") as f:
f.write(pbtxt)
model_asset_path = os.path.join(os.path.dirname(template_path), "1")
if os.path.exists(model_asset_path):
shutil.copytree(
model_asset_path,
os.path.join(
os.path.dirname(target_path), os.path.basename(model_asset_path)
),
)
def _call(args, command):
print(" ".join(command))
if args.dry_run:
return 0
else:
return subprocess.call(command)
def _existing_dir(args, directory_type, force, command, suffix=[], model_name=None):
model_name = args.model if model_name is None else model_name
target_dir = os.path.join(
args.target_dir, directory_type + "s", model_name, *suffix
)
setattr(args, directory_type, target_dir)
if force:
if not args.dry_run:
shutil.rmtree(target_dir, ignore_errors=True)
if os.path.exists(target_dir):
print(f"Skipping {command} Found {target_dir}")
return True
if not args.dry_run:
os.makedirs(target_dir, exist_ok=True)
return False
def _download(args):
if "hf_id" not in KNOWN_MODELS[args.model]:
print("Skipping Download")
return
if "download_patterns" in KNOWN_MODELS[args.model]:
patterns = KNOWN_MODELS[args.model]["download_patterns"]
else:
patterns = ["*.safetensors", "*.json"]
model_name = (
KNOWN_MODELS[args.model]["download_model_name"]
if "download_model_name" in KNOWN_MODELS[args.model]
else None
)
if _existing_dir(
args, "hf_download", args.force_download, "download", model_name=model_name
):
return
print(f"Downloading {KNOWN_MODELS[args.model]['hf_id']} to {args.hf_download}")
if args.dry_run:
return
snapshot_download(
KNOWN_MODELS[args.model]["hf_id"],
allow_patterns=patterns,
token=True,
local_dir=args.hf_download,
)
def _convert(args):
if "convert" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_checkpoint",
args.force_convert,
"convert",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
convert_command = ["python3"]
convert_command.extend(KNOWN_MODELS[args.model]["convert"])
convert_command[1] = os.path.join(args.tensorrtllm_example_dir, convert_command[1])
convert_command.extend(["--model_dir", "{args.hf_download}"])
convert_command.extend(["--output_dir", "{args.tensorrtllm_checkpoint}"])
convert_command.extend(["--tp_size", "{args.tp_size}"])
convert_command = [x.format(args=args) for x in convert_command]
_call(args, convert_command)
def _build(args):
if "build" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_engine",
args.force_build,
"build",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
build_command = [
"python3",
"-m",
"tensorrt_llm.commands.build",
"--checkpoint_dir",
"{args.tensorrtllm_checkpoint}",
"--output_dir",
"{args.tensorrtllm_engine}",
"--max_batch_size",
args.max_batch_size,
"--max_num_tokens",
args.max_num_tokens,
]
build_command.extend(KNOWN_MODELS[args.model]["build"])
build_command = [x.format(args=args) for x in build_command]
_call(args, build_command)
def _parse_args():
parser = argparse.ArgumentParser(description="Prepare Models")
parser.add_argument(
"--model",
type=str,
choices=list(KNOWN_MODELS.keys()),
default="llama-3.1-8b-instruct",
help="model",
)
parser.add_argument(
"--force-download",
action="store_true",
default=False,
)
parser.add_argument(
"--force-build",
action="store_true",
default=False,
)
parser.add_argument(
"--force-model-repo",
action="store_true",
default=False,
)
parser.add_argument(
"--force-convert",
action="store_true",
default=False,
)
parser.add_argument(
"--target_dir",
default=TARGET_DIR,
)
parser.add_argument(
"--tensorrtllm_example_dir",
default=TENSORRTLLM_EXAMPLE_DIR,
)
parser.add_argument("--reduce_fusion", default=None, choices=["enable", "disable"])
parser.add_argument(
"--enable_chunked_context", default="true", choices=["true", "false"]
)
parser.add_argument("--dry-run", action="store_true", default=False)
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--max-batch-size", type=int, default=None)
parser.add_argument("--max-num-tokens", type=int, default=None)
parser.add_argument("--postprocessing-instance-count", type=int, default=10)
parser.add_argument("--preprocessing-instance-count", type=int, default=1)
args = parser.parse_args()
args.gpu_name = get_gpu_product_name()
args.hw_name = args.gpu_name
if args.hw_name is None:
args.hw_name = "CPU"
max_batch_size = (
str(KNOWN_MODELS[args.model]["max_batch_size"])
if not args.max_batch_size
else str(args.max_batch_size)
)
args.max_batch_size = max_batch_size
max_num_tokens = (
str(KNOWN_MODELS[args.model]["max_num_tokens"])
if not args.max_num_tokens
else str(args.max_num_tokens)
)
args.max_num_tokens = max_num_tokens
args.participant_ids = ",".join([str(index) for index in range(args.tp_size)])
if args.reduce_fusion is None:
args.reduce_fusion = "enable" if args.tp_size > 1 else "disable"
return args
if __name__ == "__main__":
args = _parse_args()
print(args)
_download(args)
_convert(args)
_build(args)
_prepare(args)
print("Your models under GPU type: ", args.gpu_name)
@@ -15,7 +15,7 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
-# Disaggregated Serving
+# Disaggregated Serving with vLLM
This example demonstrates **disaggregated serving** [^1] using Triton Distributed together with vLLM engines. Disaggregated serving decouples the prefill (prompt encoding) and the decode (token generation) stages of large language model (LLM) inference into separate processes. This separation allows you to independently scale, optimize, and distribute resources for each stage.
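As a purely illustrative sketch (the names and interfaces here are hypothetical, not the example's actual API), the split can be pictured as a prefill engine that produces the KV cache and a decode engine that consumes it:

```python
# Conceptual sketch only; these engine interfaces are hypothetical and do not
# reflect the actual Triton Distributed or vLLM APIs.
def disaggregated_generate(prompt, prefill_engine, decode_engine, max_new_tokens):
    # Prefill: encode the full prompt once and hand off the KV cache.
    kv_cache, first_token = prefill_engine.prefill(prompt)
    tokens = [first_token]
    # Decode: generate the remaining tokens against the transferred cache.
    for _ in range(max_new_tokens - 1):
        token = decode_engine.step(kv_cache, tokens[-1])
        if token is None:  # end-of-sequence
            break
        tokens.append(token)
    return tokens
```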
......