Commit 0bfd9a76 authored by Neelay Shah, committed by GitHub

refactor: remove python native runtime

parent 8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer
class TritonPythonModel:
"""
This model allows Triton to act like a api server for T3 ICP
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = [
{"name": "tokens_batch", "data_type": "TYPE_INT32", "dims": [-1, -1]},
{"name": "sequence_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
]
outputs = [
{"name": "output", "data_type": "TYPE_STRING", "dims": [-1]},
]
# Store the model configuration as a dictionary.
config = auto_complete_model_config.as_dict()
input_names = []
output_names = []
for input in config["input"]:
input_names.append(input["name"])
for output in config["output"]:
output_names.append(output["name"])
# Add only missing inputs and output to the model configuration.
for input in inputs:
if input["name"] not in input_names:
auto_complete_model_config.add_input(input)
for output in outputs:
if output["name"] not in output_names:
auto_complete_model_config.add_output(output)
return auto_complete_model_config
    def initialize(self, args):
        self.logger = pb_utils.Logger
        # Parse the model configuration.
        model_config = json.loads(args["model_config"])
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
skip_special_tokens = model_config["parameters"].get("skip_special_tokens")
if skip_special_tokens is not None:
skip_special_tokens_str = skip_special_tokens["string_value"].lower()
if skip_special_tokens_str in [
"true",
"false",
"1",
"0",
"t",
"f",
"y",
"n",
"yes",
"no",
]:
self.skip_special_tokens = skip_special_tokens_str in [
"true",
"1",
"t",
"y",
"yes",
]
            else:
                self.logger.log_warn(
                    f"[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set correctly (value is {skip_special_tokens['string_value']}). Defaulting to True."
                )
                self.skip_special_tokens = True
        else:
            self.logger.log_warn(
                "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set. Defaulting to True."
            )
            self.skip_special_tokens = True
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token
for output_name in ["output"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
tokens_batch = []
sequence_lengths = []
for idx, request in enumerate(requests):
for input_tensor in request.inputs():
if input_tensor.name() == "tokens_batch":
tokens_batch.append(input_tensor.as_numpy())
elif input_tensor.name() == "sequence_lengths":
sequence_lengths.append(input_tensor.as_numpy())
else:
                    raise ValueError(f"unknown input {input_tensor.name()}")
# batch decode
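        # req_idx_offsets records where each request's rows start and end in the
        # flattened token list so the batched decode output can be split back out.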
list_of_tokens = []
req_idx_offset = 0
req_idx_offsets = [req_idx_offset]
for idx, token_batch in enumerate(tokens_batch):
for batch_idx, beam_tokens in enumerate(token_batch):
for beam_idx, tokens in enumerate(beam_tokens):
seq_len = sequence_lengths[idx][batch_idx][beam_idx]
list_of_tokens.append(tokens[:seq_len])
req_idx_offset += 1
req_idx_offsets.append(req_idx_offset)
all_outputs = self.tokenizer.batch_decode(
list_of_tokens, skip_special_tokens=self.skip_special_tokens
)
# construct responses
responses = []
for idx, request in enumerate(requests):
req_outputs = [
x.encode("utf8")
for x in all_outputs[req_idx_offsets[idx] : req_idx_offsets[idx + 1]]
]
output_tensor = pb_utils.Tensor(
"output", np.array(req_outputs).astype(self.output_dtype)
)
outputs = [output_tensor]
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
responses.append(inference_response)
return responses
def finalize(self):
"""`finalize` is called only once when the model is being unloaded.
Implementing `finalize` function is optional. This function allows
the model to perform any necessary clean ups before exit.
"""
print("Cleaning up...")
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 64
dynamic_batching {}
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
#parameters {
# key: "skip_special_tokens"
# value: {
# string_value: "${skip_special_tokens}"
# }
#}
instance_group [
{
count: 10
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, T5Tokenizer
class TritonPythonModel:
"""
This model allows Triton to act like a api server for T3 ICP
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = [
{"name": "query", "data_type": "TYPE_STRING", "dims": [1]},
]
outputs = [
{"name": "start_ids", "data_type": "TYPE_INT32", "dims": [-1]},
{"name": "start_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
]
# Store the model configuration as a dictionary.
config = auto_complete_model_config.as_dict()
input_names = []
output_names = []
for input in config["input"]:
input_names.append(input["name"])
for output in config["output"]:
output_names.append(output["name"])
# Add only missing inputs and output to the model configuration.
for input in inputs:
if input["name"] not in input_names:
auto_complete_model_config.add_input(input)
for output in outputs:
if output["name"] not in output_names:
auto_complete_model_config.add_output(output)
return auto_complete_model_config
def initialize(self, args):
model_config = json.loads(args["model_config"])
self.logger = pb_utils.Logger
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
self._tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
if isinstance(self._tokenizer, T5Tokenizer):
self._tokenizer_bos_id = self._tokenizer.sp_model.bos_id()
if not self._tokenizer.pad_token:
self._tokenizer.pad_token = self._tokenizer.eos_token
self._tokenizer_end_id = self._tokenizer.encode(
self._tokenizer.eos_token, add_special_tokens=False
)[0]
self._tokenizer_pad_id = self._tokenizer.encode(
self._tokenizer.pad_token, add_special_tokens=False
)[0]
self._vocab_size = self._tokenizer.vocab_size
for output_name in ["start_ids", "start_lengths"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
responses = []
for request in requests:
query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
# Preprocessing input data.
if isinstance(self._tokenizer, T5Tokenizer):
start_ids = [
numpy.array(
[self._tokenizer_bos_id]
+ self._tokenizer.encode(
s[0].decode(), add_special_tokens=False
)
).astype(numpy.int32)
for s in query
]
else:
start_ids = [
numpy.array(
self._tokenizer.encode(s[0].decode(), add_special_tokens=False)
).astype(numpy.int32)
for s in query
]
start_lengths = numpy.array([[len(ids)] for ids in start_ids]).astype(
numpy.int32
)
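            # Right-pad each sequence to the batch max length with the pad id
            # so the ragged token lists can be stacked into one int32 matrix.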
max_len = 0
for seq in start_ids:
max_len = max(max_len, seq.shape[0])
start_ids = numpy.stack(
[
numpy.pad(
seq,
(0, max_len - seq.shape[0]),
"constant",
constant_values=(0, self._tokenizer_pad_id),
)
for seq in start_ids
]
)
start_ids_tensor = pb_utils.Tensor(
"start_ids", numpy.array(start_ids).astype(self.start_ids_dtype)
)
start_lengths_tensor = pb_utils.Tensor(
"start_lengths",
numpy.array(start_lengths).astype(self.start_lengths_dtype),
)
outputs = [start_ids_tensor, start_lengths_tensor]
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
responses.append(inference_response)
return responses
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 1
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
instance_group [
{
count: 10
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
def get_gpu_product_name():
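    # Query nvidia-smi for the name of GPU 0; returns None when nvidia-smi is
    # unavailable or reports no devices. Spaces are replaced with underscores.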
env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = "0"
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu",
"name",
"--format",
"csv",
],
capture_output=True,
text=True,
env=env,
)
result_values = [
x.replace(", ", ",").split(",") for x in result.stdout.split("\n") if x
]
if result_values[0][0] == "No devices were found":
return None
return result_values[1][0].strip().replace(" ", "_")
except FileNotFoundError:
return None
def number_of_gpus():
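    # Count the lines printed by `nvidia-smi --list-gpus`; returns 0 when
    # nvidia-smi is not installed.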
try:
result = subprocess.run(
["nvidia-smi", "--list-gpus"], capture_output=True, text=True
)
        gpu_lines = [line for line in result.stdout.strip().split("\n") if line]
        return len(gpu_lines)
except FileNotFoundError:
return 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
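# Registry of supported models: each entry describes how to download the model
# (hf_id, download_patterns), how to convert and build a TensorRT-LLM engine,
# and which config.pbtxt templates to render with which arguments.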
KNOWN_MODELS = {
"mock": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_patterns": ["*.json"],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
(
"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"context",
),
(
"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"generate",
),
(
"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"tensorrt_llm",
),
],
"template_arguments": {
"tokenizer_dir": "{args.hf_download}",
"triton_max_batch_size": "{args.max_batch_size}",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"context_token_latency_ms": "0.1",
"generate_token_latency_ms": "0.5",
},
},
"llama-3.1-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"download_model_name": "llama-3.1-70b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--max_seq_len",
"131072",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"disable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"enable_chunked_context": "{args.enable_chunked_context}",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3.1-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_model_name": "llama-3.1-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 256,
"templates": [
("tensorrt_llm", "generate"),
"postprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
"preprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "False",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-default": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"download_model_name": "llama-3-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"preprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 128,
"templates": [
"postprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/generate",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"max_batch_size": 512,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 16384,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
}
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
import subprocess
from string import Template
from gpu_info import get_gpu_product_name
from huggingface_hub import snapshot_download
from known_models import KNOWN_MODELS
TARGET_DIR = "/workspace/examples/python/llm/tensorrtllm/operators"
TENSORRTLLM_EXAMPLE_DIR = "/tensorrtllm_backend/tensorrt_llm/examples"
TENSORRTLLM_BACKEND_DIR = "/tensorrtllm_backend"
def _prepare(args):
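    # Render the config.pbtxt templates shipped with the TensorRT-LLM backend
    # and copy them (plus any versioned model assets) into the model repository.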
templates = KNOWN_MODELS[args.model]["templates"]
template_arguments = KNOWN_MODELS[args.model]["template_arguments"]
model_name = (
KNOWN_MODELS[args.model]["model_repo_name"]
if "model_repo_name" in KNOWN_MODELS[args.model]
else None
)
_existing_dir(
args,
"tensorrtllm_model",
args.force_model_repo,
"model repo",
suffix=[args.hw_name, f"TP_{args.tp_size}"],
model_name=model_name,
)
for argument, value in template_arguments.items():
template_arguments[argument] = value.format(args=args)
template_arguments["request_stats_max_iterations"] = 1000
print(template_arguments)
for template in templates:
if isinstance(template, tuple):
template_basename = template[1]
template = template[0]
else:
template_basename = os.path.basename(template)
template_path = os.path.join(
TENSORRTLLM_BACKEND_DIR,
"all_models",
"inflight_batcher_llm",
template,
"config.pbtxt",
)
if template == "ensemble":
target_path = os.path.join(
args.tensorrtllm_model, args.model, "config.pbtxt"
)
else:
target_path = os.path.join(
args.tensorrtllm_model, template_basename, "config.pbtxt"
)
if not args.force_model_repo and os.path.exists(target_path):
continue
print(template_path, os.path.exists(template_path), target_path)
with open(template_path) as f:
pbtxt_template = Template(f.read())
pbtxt = pbtxt_template.safe_substitute(template_arguments)
pbtxt = pbtxt.replace(f'name: "{os.path.basename(template)}"', "")
if not args.dry_run:
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "w") as f:
f.write(pbtxt)
model_asset_path = os.path.join(os.path.dirname(template_path), "1")
if os.path.exists(model_asset_path):
shutil.copytree(
model_asset_path,
os.path.join(
os.path.dirname(target_path), os.path.basename(model_asset_path)
),
)
def _call(args, command):
print(" ".join(command))
if args.dry_run:
return 0
else:
return subprocess.call(command)
def _existing_dir(args, directory_type, force, command, suffix=(), model_name=None):
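    # Resolve the target directory for this artifact type, store it on args,
    # and return True (caller skips the step) when the directory already
    # exists and force is not set.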
model_name = args.model if model_name is None else model_name
target_dir = os.path.join(
args.target_dir, directory_type + "s", model_name, *suffix
)
setattr(args, directory_type, target_dir)
if force:
if not args.dry_run:
shutil.rmtree(target_dir, ignore_errors=True)
if os.path.exists(target_dir):
print(f"Skipping {command} Found {target_dir}")
return True
if not args.dry_run:
os.makedirs(target_dir, exist_ok=True)
return False
def _download(args):
if "hf_id" not in KNOWN_MODELS[args.model]:
print("Skipping Download")
return
if "download_patterns" in KNOWN_MODELS[args.model]:
patterns = KNOWN_MODELS[args.model]["download_patterns"]
else:
patterns = ["*.safetensors", "*.json"]
model_name = (
KNOWN_MODELS[args.model]["download_model_name"]
if "download_model_name" in KNOWN_MODELS[args.model]
else None
)
if _existing_dir(
args, "hf_download", args.force_download, "download", model_name=model_name
):
return
print(f"Downloading {KNOWN_MODELS[args.model]['hf_id']} to {args.hf_download}")
if args.dry_run:
return
snapshot_download(
KNOWN_MODELS[args.model]["hf_id"],
allow_patterns=patterns,
token=True,
local_dir=args.hf_download,
)
def _convert(args):
if "convert" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_checkpoint",
args.force_convert,
"convert",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
convert_command = ["python3"]
convert_command.extend(KNOWN_MODELS[args.model]["convert"])
convert_command[1] = os.path.join(args.tensorrtllm_example_dir, convert_command[1])
convert_command.extend(["--model_dir", "{args.hf_download}"])
convert_command.extend(["--output_dir", "{args.tensorrtllm_checkpoint}"])
convert_command.extend(["--tp_size", "{args.tp_size}"])
convert_command = [x.format(args=args) for x in convert_command]
_call(args, convert_command)
def _build(args):
if "build" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_engine",
args.force_build,
"build",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
build_command = [
"python3",
"-m",
"tensorrt_llm.commands.build",
"--checkpoint_dir",
"{args.tensorrtllm_checkpoint}",
"--output_dir",
"{args.tensorrtllm_engine}",
"--max_batch_size",
args.max_batch_size,
"--max_num_tokens",
args.max_num_tokens,
]
build_command.extend(KNOWN_MODELS[args.model]["build"])
build_command = [x.format(args=args) for x in build_command]
_call(args, build_command)
def _parse_args():
parser = argparse.ArgumentParser(description="Prepare Models")
parser.add_argument(
"--model",
type=str,
choices=list(KNOWN_MODELS.keys()),
default="llama-3.1-8b-instruct",
help="model",
)
parser.add_argument(
"--force-download",
action="store_true",
default=False,
)
parser.add_argument(
"--force-build",
action="store_true",
default=False,
)
parser.add_argument(
"--force-model-repo",
action="store_true",
default=False,
)
parser.add_argument(
"--force-convert",
action="store_true",
default=False,
)
parser.add_argument(
"--target_dir",
default=TARGET_DIR,
)
parser.add_argument(
"--tensorrtllm_example_dir",
default=TENSORRTLLM_EXAMPLE_DIR,
)
parser.add_argument("--reduce_fusion", default=None, choices=["enable", "disable"])
parser.add_argument(
"--enable_chunked_context", default="true", choices=["true", "false"]
)
parser.add_argument("--dry-run", action="store_true", default=False)
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--max-batch-size", type=int, default=None)
parser.add_argument("--max-num-tokens", type=int, default=None)
parser.add_argument("--postprocessing-instance-count", type=int, default=10)
parser.add_argument("--preprocessing-instance-count", type=int, default=1)
args = parser.parse_args()
args.gpu_name = get_gpu_product_name()
args.hw_name = args.gpu_name
if args.hw_name is None:
args.hw_name = "CPU"
max_batch_size = (
str(KNOWN_MODELS[args.model]["max_batch_size"])
if not args.max_batch_size
else str(args.max_batch_size)
)
args.max_batch_size = max_batch_size
max_num_tokens = (
str(KNOWN_MODELS[args.model]["max_num_tokens"])
if not args.max_num_tokens
else str(args.max_num_tokens)
)
args.max_num_tokens = max_num_tokens
args.participant_ids = ",".join([str(index) for index in range(args.tp_size)])
if args.reduce_fusion is None:
args.reduce_fusion = "enable" if args.tp_size > 1 else "disable"
# args.participant_ids = ""
return args
if __name__ == "__main__":
args = _parse_args()
print(args)
_download(args)
_convert(args)
_build(args)
_prepare(args)
print("Your models under GPU type: ", args.gpu_name)
<!--
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Disaggregated Serving with vLLM
> **Warning**
> This example is currently not tested and might not work as expected. For working disaggregated serving examples, please see the [vLLM example](/examples/python_rs/llm/vllm/).
This example demonstrates **disaggregated serving** [^1] using Triton Distributed together with vLLM engines. Disaggregated serving decouples the prefill (prompt encoding) and the decode (token generation) stages of large language model (LLM) inference into separate processes. This separation allows you to independently scale, optimize, and distribute resources for each stage.
In this example, you will deploy:
- An **OpenAI-compatible API server** (which receives requests and streams responses).
- One or more **prefill workers** (for encoding the prompt).
- One or more **decode workers** (for generating tokens based on the encoded prompt).
![Overview of disaggregated serving deployment architecture](assets/vllm_disagg_architecture_overview.jpg)
For more details on the basics of Triton Distributed, please see the [Hello World example](../../hello_world/).
---
## 1. Prerequisites
1. **GPU Availability**
This setup requires at least two GPUs:
- One GPU is typically used by the **prefill** process.
- Another GPU is used by the **decode** process.
In production systems with heavier loads, you will typically allocate more GPUs across multiple prefill and decode workers.
2. **NATS or Another Coordination Service**
Triton Distributed uses NATS by default for coordination and message passing. Make sure your environment has a running NATS service accessible via a valid `nats://<address>:<port>` endpoint. By default, the examples assume `nats://localhost:4223` (a minimal startup sketch follows this list).
3. **vLLM Patch**
This example requires some features that are not yet in the main vLLM release. A patch is automatically applied inside the provided container. Details of the patch can be found [here](../../../container/deps/vllm/). The current patch is compatible with **vLLM 0.6.3post1**.
4. **Supported GPUs**
- For FP8 usage, GPUs with **Compute Capability >= 8.9** are required.
- If you have older GPUs, consider BF16/FP16 precision variants instead of `FP8`. (See [below](#model-precision-variants).)
5. **HuggingFace**
- You need a HuggingFace account to download the model, and you must set the `HF_TOKEN` environment variable.
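If you need a standalone NATS service for local testing, the following is a minimal sketch; the port and JetStream store directory match the defaults used by the deployment scripts later in this example and are assumptions you can change:
```bash
# Start a JetStream-enabled NATS server for local coordination.
# Port 4223 matches the default request-plane URI in this example; the
# store directory is a throwaway temp dir (an assumption for local runs).
NATS_STORE="$(mktemp -d)"
nats-server -p 4223 --jetstream --store_dir "${NATS_STORE}" &
```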
---
## 2. Building the Environment
The example is designed to run in a containerized environment using Triton Distributed, vLLM, and associated dependencies. To build the container:
```bash
./container/build.sh --framework vllm
```
This command pulls necessary dependencies and patches vLLM in the container image.
---
## 3. Starting the Deployment
Below is a minimal example of how to start each component of a disaggregated serving setup. The typical sequence is:
1. **Start the Context Worker(s) and Request Plane**
2. **Start the Generate Worker(s)**
3. **Start the API Server** (handles incoming requests and coordinates workers)
All components must be able to connect to the same request plane to coordinate.
### 3.1 HuggingFace Token
```bash
export HF_TOKEN=<YOUR TOKEN>
```
### 3.2 Launch Interactive Environment
```bash
./container/run.sh --framework vllm -it
```
Note: all subsequent commands are run in the same container for simplicity.
Note: by default this command makes all GPU devices visible. Use the `--gpus` flag to selectively expose GPU devices.
### 3.3 Launch Context Worker and Request Plane
The context stage encodes incoming prompts. By default, vLLM uses GPU resources to tokenize and prepare the model’s key-value (KV) caches.
Within the container start the context worker and the request plane:
```bash
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count 1 \
--request-plane-uri ${HOSTNAME}:4223 \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size 1 \
--generate-tp-size 1 \
--initialize-request-plane &
```
**Key flags**:
- `--context-worker-count`: Launches only context (prefill) workers.
- `--kv-cache-dtype fp8`: Using FP8 for caching (requires CC >= 8.9).
- `CUDA_VISIBLE_DEVICES=0`: Binds worker to GPU `0`.
#### Expected Output
```
<SNIP>
Workers started ... press Ctrl-C to Exit
[168] 2025/01/24 09:17:38.879908 [INF] Starting nats-server
[168] 2025/01/24 09:17:38.879982 [INF] Version: 2.10.24
[168] 2025/01/24 09:17:38.879987 [INF] Git: [1d6f7ea]
[168] 2025/01/24 09:17:38.879989 [INF] Name: NDBCCXARM6D2BMMRJOKZCJD4TGVXXPCJKQRXALJOPHLA5W7ISCW4VHU5
[168] 2025/01/24 09:17:38.879992 [INF] Node: S4g51H7K
[168] 2025/01/24 09:17:38.879995 [INF] ID: NDBCCXARM6D2BMMRJOKZCJD4TGVXXPCJKQRXALJOPHLA5W7ISCW4VHU5
[168] 2025/01/24 09:17:38.880339 [INF] Starting JetStream
<SNIP>
INFO 01-24 09:17:49 parallel_state.py:942] Stage: PREFILL
```
### 3.4 Launch Generate (Decode) Worker
The generate stage consumes the KV cache produced in the context step and generates output tokens.
Within the container start the generate worker:
```bash
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count 1 \
--request-plane-uri ${HOSTNAME}:4223 \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size 1 \
--generate-tp-size 1 &
```
> [!NOTE]
> - The first run in a newly launched container will
> include the model download. Please wait until you see the
> llama handler started before sending requests.
**Key flags**:
- `--generate-worker-count`: Launches decode worker(s).
- `CUDA_VISIBLE_DEVICES=1`: Binds worker to GPU `1`.
#### Expected Output
```
<SNIP>
model-00002-of-00002.safetensors: 100% 4.08G/4.08G [01:36<00:00, 42.2MB/s]
model-00001-of-00002.safetensors: 100% 4.71G/5.00G [01:51<00:06, 41.9MB/s]
<SNIP>
INFO 01-24 09:21:22 model_runner.py:1406] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-24 09:21:22 model_runner.py:1410] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
<SNIP>
09:22:10 worker.py:266[Triton Worker] INFO: Worker started...
09:22:10 worker.py:241[Triton Worker] INFO: Starting generate handler...
09:22:10 worker.py:266[Triton Worker] INFO: Worker started...
09:22:10 worker.py:241[Triton Worker] INFO: Starting llama handler...
```
> [!NOTE]
> - You can run multiple prefill and decode workers for higher throughput.
> - For large models, ensure you have enough GPU memory (or GPUs).
### 3.5 API Server
The API server in a vLLM-disaggregated setup listens for OpenAI-compatible requests on a chosen port (default 8005). Below is an example command:
```bash
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${HOSTNAME}:4223 \
--api-server-host ${HOSTNAME} \
--model-name llama \
--api-server-port 8005 &
```
#### Expected Output
```
[WARNING] Adding CORS for the following origins: ['http://localhost']
INFO: Started server process [498]
INFO: Waiting for application startup.
TRACE: ASGI [1] Started scope={'type': 'lifespan', 'asgi': {'version': '3.0', 'spec_version': '2.0'}, 'state': {}}
TRACE: ASGI [1] Receive {'type': 'lifespan.startup'}
TRACE: ASGI [1] Send {'type': 'lifespan.startup.complete'}
INFO: Application startup complete.
INFO: Uvicorn running on http://2u2g-gen-0349:8005 (Press CTRL+C to quit)
```
## 4. Sending Requests
Once the API server is running (by default on `localhost:8005`), you can send OpenAI-compatible requests. For example:
```bash
curl ${HOSTNAME}:8005/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
```
The above request will return a streamed response with the model’s answer.
#### Expected Output
```
INFO 01-24 09:33:05 async_llm_engine.py:207] Added request 052eabe0-fc54-4f7c-9be8-4926523b26fc___0.
INFO 01-24 09:33:05 kv_cache.py:378] Fetching source address for worker 0 by key worker_0_rank_0
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<290 bytes>', 'more_body': True}
data: {"id":"052eabe0-fc54-4f7c-9be8-4926523b26fc","choices":[{"delta":{"content":"\n\n","role":"assistant"},"logprobs":null,"finish_reason":null,"index":0}],"created":1737711185,"model":"llama","system_fingerprint":"052eabe0-fc54-4f7c-9be8-4926523b26fc","object":"chat.completion.chunk"}
INFO 01-24 09:33:05 async_llm_engine.py:175] Finished request 052eabe0-fc54-4f7c-9be8-4926523b26fc___0.
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<317 bytes>', 'more_body': True}
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<14 bytes>', 'more_body': True}
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<0 bytes>', 'more_body': False}
data: {"id":"052eabe0-fc54-4f7c-9be8-4926523b26fc","choices":[{"delta":{"content":"The capital of France is Paris.","role":"assistant"},"logprobs":null,"finish_reason":null,"index":0}],"created":1737711185,"model":"llama","system_fingerprint":"052eabe0-fc54-4f7c-9be8-4926523b26fc","object":"chat.completion.chunk"}
TRACE: 127.0.0.1:49878 - ASGI [2] Receive {'type': 'http.disconnect'}
data: [DONE]
```
## 5. Benchmarking
You can benchmark this setup using [**GenAI-Perf**](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/README.md), which supports OpenAI endpoints for chat or completion requests.
```bash
genai-perf profile \
-m llama \
--url ${HOSTNAME}:8005 \
--endpoint-type chat \
--streaming \
--num-dataset-entries 1000 \
--service-kind openai \
--endpoint v1/chat/completions \
--warmup-request-count 10 \
--random-seed 123 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-stddev 0 \
--tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--synthetic-input-tokens-mean 300 \
--output-tokens-mean 3000 \
--extra-inputs seed:100 \
--extra-inputs min_tokens:150 \
--extra-inputs max_tokens:150 \
--profile-export-file my_profile_export.json \
--artifact-dir artifacts/ \
--concurrency 32 \
--request-count 320 \
-- -v \
--async
```
**Key Parameters**:
- **`-m llama`**: Your model name (must match the name used in your server).
- **`--url <API_SERVER_HOST>:8005`**: The location of your API server.
- **`--endpoint v1/chat/completions`**: Using the OpenAI chat endpoint.
- **`--streaming`**: Ensures tokens are streamed back for chat-like usage.
## 6. Teardown
To tear down a deployment during local development, you can either kill the
container or kill the relevant processes involved in the deployment.
To kill the processes running inside the container, you can run:
```bash
pkill -9 -f python3
pkill -9 -f nats-server
```
You will generally want to make sure you have a clean slate between
deployments to avoid any unexpected errors.
NOTE: If you have other unrelated processes in the environment with `python3`
in the name, the `pkill` command above will terminate them as well. In this
scenario, you could select specific process IDs and use the following command
instead for each process ID, replacing `<pid>` below:
```bash
kill -9 <pid>
```
## 7. Model Precision Variants
In the commands above, we used the FP8 variant `neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8` because it significantly reduces KV cache size, which helps with network transfer and memory usage. However, if your GPU is older or does not support FP8, try using the standard BF16/FP16 precision variant, for example:
```bash
--model-name meta-llama/Meta-Llama-3.1-8B-Instruct
--kv-cache-dtype bf16
```
## 8. Multi-node Deployment
To deploy the solution in a multi-node environment, please refer to the [deploy_llama_8b_disaggregated_multinode.sh](examples/llm/vllm/deploy/deploy_llama_8b_disaggregated_multinode.sh) script. On the head node, run the NATS server, API server, and context worker with:
```
./examples/llm/vllm/deploy/deploy_llama_8b_disaggregated.sh context --head-url <head url>
```
On the second node, run the generate worker:
```
./examples/llm/vllm/deploy/deploy_llama_8b_disaggregated.sh generate --head-url <head url>
```
The example script defaults to launching one context worker with TP 1 on the head node and one generate worker with TP 1 on the secondary node. This can be changed for other configurations; see the script for details.
## 9. Known Issues & Limitations
1. **Fixed Worker Count**
Currently, the number of prefill and decode workers must be fixed at the start of deployment. Dynamically adding or removing workers is not yet supported.
2. **KV Transfer OOM**
During heavy loads, KV cache transfers between prefill and decode processes may cause out-of-memory errors if there is insufficient GPU memory.
3. **KV Cache Preemption**
Cache preemption (evicting old prompts to free memory) is not supported in the current patch.
4. **Experimental Patch**
The required vLLM patch is experimental and not yet merged into upstream vLLM. Future releases may remove the need for a custom patch.
5. **Single generate worker**
Only one generate worker can be used in a single deployment.
6. **Streaming**
When streaming is enabled, only two responses will be returned in the stream: the first token and the complete response.
## 10. References
[^1]: Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao
Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language
model serving. *arXiv:2401.09670v3 [cs.DC]*, 2024.
For more details on Triton Distributed and additional examples, please consult the official [Hello World example](../../hello_world/) and the [Triton Inference Server documentation](https://github.com/triton-inference-server/server).
<!--
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Tuning and Benchmarking Disaggregated Serving
**Disaggregated Serving** [^1] enables developers and teams deploying
LLMs to tune their deployment based on input and output sequence
lengths to achieve a targeted SLA with the right mix of context and
generation workers. In particular, disaggregated serving enables teams
to choose different parallelization strategies for each phase and to
balance throughput (tokens/sec/GPU) against latency (tokens/sec/user).
## Example
### 50 tokens per sec SLA with Input (3000) / Output (150) Sequence Length Tuning
To determine the best mix of context and generate workers for a
target latency and given input and output sequence lengths, we generally
perform "sweeps" comparing different strategies to find the best
throughput within the SLA.
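As an illustration, such a sweep can be scripted as a loop over concurrency levels, reusing the `genai-perf` command from the Benchmark section below; the concurrency values and artifact directory naming here are illustrative assumptions:
```bash
# Sweep several concurrency levels with fixed ISL/OSL (3000/150) and keep
# one artifact directory per run for later comparison. The concurrency
# range is an assumption; pick values that bracket your target SLA.
for concurrency in 8 16 32 48 64; do
  genai-perf profile \
    -m llama \
    --url <api server url> \
    --endpoint-type chat \
    --streaming \
    --num-dataset-entries 100 \
    --service-kind openai \
    --endpoint v1/chat/completions \
    --tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
    --synthetic-input-tokens-mean 3000 \
    --output-tokens-mean 150 \
    --extra-inputs min_tokens:150 \
    --extra-inputs max_tokens:150 \
    --artifact-dir "artifacts/concurrency_${concurrency}/" \
    --concurrency "${concurrency}" \
    --request-count "$((10 * concurrency))" \
    -- -v \
    --async
done
```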
For example, for input sequence length 3000 and output sequence length
150, after sweeping different tensor parallelism strategies on two
8 x H100 GPU nodes, we found that using 4 instances of TP 2 for
context (on one node) and 1 instance of TP 8 for generate (on
the second node) gives the best throughput at a latency target of 50
tokens per sec per user.
At that latency target, our early measurements show disaggregated
serving outperforming traditional aggregated LLM serving by more than 1.5x
(with throughput normalized per GPU).
### Reproducing Results
To reproduce similar results on a two-node 8 x H100 GPU system, we
provide sample scripts.
### Launch Context Workers on First Node
On the first (head) node:
```
bash deploy_llama_70b_context_tp2dp4.sh --head-url <head url>
```
### Launch Generate Worker on Second Node
On the second node:
```
bash deploy_llama_70b_generate_tp8dp1.sh --head-url <head url>
```
### Benchmark
The following `genai-perf` command simulates traffic with 3000 input and 150 output sequence lengths.
```
genai-perf profile \
-m llama \
--url <api server url> \
--endpoint-type chat \
--streaming \
--num-dataset-entries 100 \
--service-kind openai \
--endpoint v1/chat/completions \
--warmup-request-count 10 \
--random-seed 123 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-stddev 0 \
--tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--synthetic-input-tokens-mean 3000 \
--output-tokens-mean 150 \
--extra-inputs seed:100 \
--extra-inputs min_tokens:150 \
--extra-inputs max_tokens:150 \
--profile-export-file my_profile_export.json \
--artifact-dir artifacts/ \
--concurrency <N> \
--request-count <10*N> \
-- -v \
--async
```
### Example Results
The following results are given as an example; they are not fully
optimized and do not indicate what you may get locally.
| label    | configuration                  | concurrency | output token throughput per request (tokens/s) | output token throughput per GPU (tokens/s) | time to first token (ms) | inter-token latency (ms) |
|----------|--------------------------------|-------------|------------------------------------------------|--------------------------------------------|--------------------------|--------------------------|
| disagg   | context tp2dp4 generate tp8dp1 | 48          | 49.18                                          | 87.56                                      | 1157.49                  | 15.94                    |
| baseline | baseline tp4dp1                | 4           | 50.27                                          | 56.26                                      | 709.25                   | 15.27                    |
### Baseline Comparison
On a single node you can run an aggregated baseline for comparison. With
aggregated workers, we found the best throughput at the target SLA and
sequence lengths using 2 instances with tensor parallelism 4.
```
bash deploy_llama_70b_baseline_tp4dp2.sh --head-url <head url>
```
To see the results use the same `genai-perf` command used to benchmark
the disaggregated setup.
### Stopping the Deployment
```
pkill -SIGINT -f python3
pkill -SIGINT -f nats
```
## Known Issue
Sometimes during the first run there are NATS errors. In that case, just restart the deployment.
## References
[^1]: Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao
Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language
model serving. *arXiv:2401.09670v3 [cs.DC]*, 2024.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_PORT=36183
export VLLM_CONTEXT_WORKERS=4
export VLLM_CONTEXT_TP_SIZE=2
export VLLM_GENERATE_WORKERS=1
export VLLM_GENERATE_TP_SIZE=8
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_PORT=8005
if [ "$1" != "--head-url" ] || [ -z "$2" ]; then
echo "Usage: $0 --head-url <head url>"
exit 1
fi
head_url=$2
export NATS_HOST="$head_url"
export VLLM_TORCH_HOST="$head_url"
export API_SERVER_HOST="$head_url"
# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name "llama" \
--api-server-port ${API_SERVER_PORT} &
# Empty --log-dir will dump logs to stdout
echo "Starting vLLM baseline workers..."
gpu_configs=(
"0,1"
"2,3"
"4,5"
"6,7"
)
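# Launch one TP-2 context worker per GPU pair listed above.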
for i in "${!gpu_configs[@]}"; do
CUDA_VISIBLE_DEVICES="${gpu_configs[$i]}" \
VLLM_WORKER_ID=$i \
python3 -m llm.vllm.deploy \
--context-worker-count 1 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--worker-name llama \
--kv-cache-dtype fp8 \
--dtype auto \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.5 &
done
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_PORT=36183
export VLLM_CONTEXT_WORKERS=4
export VLLM_CONTEXT_TP_SIZE=2
export VLLM_GENERATE_WORKERS=1
export VLLM_GENERATE_TP_SIZE=8
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_PORT=8005
if [ "$1" != "--head-url" ] || [ -z "$2" ]; then
echo "Usage: $0 --head-url <head url>"
exit 1
fi
head_url=$2
export NATS_HOST="$head_url"
export VLLM_TORCH_HOST="$head_url"
export API_SERVER_HOST="$head_url"
# Empty --log-dir will dump logs to stdout
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
VLLM_WORKER_ID=${VLLM_CONTEXT_WORKERS} \
python3 -m llm.vllm.deploy \
--generate-worker-count 1 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--worker-name llama \
--kv-cache-dtype fp8 \
--dtype auto \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 &
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import signal
import sys
import time
from pathlib import Path
from llm.vllm.operators.vllm import (
VllmContextOperator,
VllmGenerateOperator,
VllmOperator,
)
from triton_distributed.runtime import Deployment, OperatorConfig, WorkerConfig
from .parser import parse_args
deployment = None
def handler(signum, frame):
exit_code = 0
if deployment:
print("Stopping Workers")
exit_code = deployment.stop()
print(f"Workers stopped, exit code: {exit_code}")
sys.exit(exit_code)
signals = (signal.SIGHUP, signal.SIGTERM, signal.SIGINT)
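# Not every signal can be trapped on every platform (e.g., SIGHUP is
# unavailable on Windows), so skip any that cannot be registered.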
for sig in signals:
try:
signal.signal(sig, handler)
except Exception:
pass
def _create_context_op(name, args, max_inflight_requests):
return OperatorConfig(
name=name,
implementation=VllmContextOperator,
max_inflight_requests=int(max_inflight_requests),
parameters=vars(args),
)
def _create_generate_op(name, args, max_inflight_requests):
return OperatorConfig(
name=name,
implementation=VllmGenerateOperator,
max_inflight_requests=int(max_inflight_requests),
parameters=vars(args),
)
def _create_baseline_op(name, args, max_inflight_requests):
return OperatorConfig(
name=name,
implementation=VllmOperator,
max_inflight_requests=int(max_inflight_requests),
parameters=vars(args),
)
def main(args):
global deployment
if args.log_dir:
log_dir = Path(args.log_dir)
log_dir.mkdir(exist_ok=True)
worker_configs = []
# Context/Generate workers used for disaggregated serving.
# NOTE: this entry point launches at most one worker of each type per
# process; *-worker-count values other than 1 are ignored here.
if args.context_worker_count == 1:
context_op = _create_context_op(args.worker_name, args, 1000)
context = WorkerConfig(
operators=[context_op],
# Context worker gets --worker-name as it is the model that will
# be hit first in a disaggregated setting.
name=args.worker_name,
)
worker_configs.append((context, 1))
if args.generate_worker_count == 1:
generate_op = _create_generate_op("generate", args, 1000)
generate = WorkerConfig(
operators=[generate_op],
# Generate worker gets a hard-coded name "generate" as the context
# worker will talk directly to it.
name="generate",
)
worker_configs.append((generate, 1))
# NOTE: Launching baseline worker and context/generate workers at
# the same time is not currently supported.
if args.baseline_worker_count == 1:
# Baseline worker has a hard-coded name just for testing purposes
baseline_op = _create_baseline_op("baseline", args, 1000)
baseline = WorkerConfig(
operators=[baseline_op],
name="baseline",
)
worker_configs.append((baseline, 1))
deployment = Deployment(
worker_configs,
initialize_request_plane=args.initialize_request_plane,
log_dir=args.log_dir,
log_level=args.log_level,
starting_metrics_port=args.starting_metrics_port,
request_plane_args=([], {"request_plane_uri": args.request_plane_uri}),
)
deployment.start()
print("Workers started ... press Ctrl-C to exit")
while True:
time.sleep(10)
if __name__ == "__main__":
args = parse_args()
main(args)
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# FIXME: Convert this script to README steps
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=localhost
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=localhost
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=localhost
export API_SERVER_PORT=8005
# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name "baseline" \
--api-server-port ${API_SERVER_PORT} &
# Empty --log-dir will dump logs to stdout
echo "Starting vLLM baseline workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--baseline-worker-count ${VLLM_BASELINE_WORKERS} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--baseline-tp-size ${VLLM_BASELINE_TP_SIZE} \
--log-dir ""
# NOTE: It may take more than a minute for the vllm worker to start up
# if the model weights aren't cached and need to be downloaded.
echo "Waiting for deployment to finish startup..."
sleep 60
# Make a Chat Completion Request
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "baseline",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
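# A non-streaming variant of the same request (sketch; assumes the API
# server also implements the standard OpenAI-compatible non-streaming path):
# curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{"model": "baseline", "messages": [{"role": "user", "content": "What is the capital of France?"}], "max_tokens": 25, "stream": false}'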
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# FIXME: Convert this script to README steps
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=localhost
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=0
export VLLM_CONTEXT_WORKERS=1
export VLLM_GENERATE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_CONTEXT_TP_SIZE=1
export VLLM_GENERATE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=localhost
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=localhost
export API_SERVER_PORT=8005
# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name llama \
--api-server-port ${API_SERVER_PORT} &
# Start VLLM Worker 0
echo "Starting vLLM context workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count ${VLLM_CONTEXT_WORKERS} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
# Start VLLM Worker 1
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count ${VLLM_GENERATE_WORKERS} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
# NOTE: It may take more than a minute for the vllm worker to start up
# if the model weights aren't cached and need to be downloaded.
echo "Waiting for deployment to finish startup..."
echo "Once you see all ranks connected to the server, it should be ready..."
echo "Example output:"
echo -e "\tRank 0 connected to the server"
echo -e "\t..."
echo -e "\tRank 1 connected to the server"
sleep 120
# Make a Chat Completion Request
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=""
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=0
export VLLM_CONTEXT_WORKERS=1
export VLLM_GENERATE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_CONTEXT_TP_SIZE=1
export VLLM_GENERATE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=""
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=""
export API_SERVER_PORT=8005
start_nats_server() {
local head_url=$1
export NATS_HOST="$head_url"
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
}
start_api_server() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${head_url}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name llama \
--api-server-port ${API_SERVER_PORT} &
}
start_context_worker() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting vLLM context workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count ${VLLM_CONTEXT_WORKERS} \
--request-plane-uri ${head_url}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
}
start_generate_worker() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count ${VLLM_GENERATE_WORKERS} \
--request-plane-uri ${head_url}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
}
case "$1" in
context)
if [ "$2" != "--head-url" ] || [ -z "$3" ]; then
echo "Usage: $0 context --head-url <head url>"
exit 1
fi
head_url=$3
export API_SERVER_HOST="$head_url"
start_nats_server "$head_url"
start_api_server "$head_url"
start_context_worker "$head_url"
;;
generate)
if [ "$2" != "--head-url" ] || [ -z "$3" ]; then
echo "Usage: $0 generate --head-url <head url>"
exit 1
fi
head_url=$3
export API_SERVER_HOST="$head_url"
start_generate_worker "$head_url"
;;
*)
echo "Usage: $0 {context|generate} --head-url <head url>"
exit 1
;;
esac
echo "Waiting for deployment to finish startup..."
echo "Once you see all ranks connected to the server, it should be ready..."
echo "Example output:"
echo -e "\tRank 0 connected to the server"
echo -e "\t..."
echo -e "\tRank 1 connected to the server"
sleep 120
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
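# Usage sketch (two nodes): run "$0 context --head-url <head url>" on the
# head node first, then "$0 generate --head-url <head url>" on the second
# node; both sides must reach the same NATS request plane on ${NATS_PORT}.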
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
# FIXME: Remove unused args if any
def parse_args():
parser = argparse.ArgumentParser(description="Run an example of the VLLM pipeline.")
default_log_dir = ""
parser.add_argument(
"--log-dir",
type=str,
default=str(default_log_dir),
help="Log directory; an empty string dumps logs to stdout",
)
parser.add_argument(
"--request-plane-uri",
type=str,
default="nats://localhost:4223",
help="URI of request plane",
)
parser.add_argument(
"--initialize-request-plane",
default=False,
action="store_true",
help="Initialize the request plane, should only be done once per deployment",
)
parser.add_argument(
"--starting-metrics-port",
type=int,
default=0,
help="Metrics port for the first worker; each subsequent worker exposes metrics on the next port (e.g., starting at 50000: worker 1: 50000, worker 2: 50001, worker 3: 50002)",
)
parser.add_argument(
"--context-worker-count",
type=int,
required=False,
default=0,
help="Number of context workers",
)
parser.add_argument(
"--dummy-worker-count",
type=int,
required=False,
default=0,
help="Number of dummy workers",
)
parser.add_argument(
"--baseline-worker-count",
type=int,
required=False,
default=0,
help="Number of baseline workers",
)
parser.add_argument(
"--generate-worker-count",
type=int,
required=False,
default=0,
help="Number of generate workers",
)
parser.add_argument(
"--nats-url",
type=str,
required=False,
default="nats://localhost:4223",
help="URL of NATS server",
)
parser.add_argument(
"--model-name",
type=str,
required=False,
default="meta-llama/Meta-Llama-3.1-8B-Instruct",
help="Model name",
)
parser.add_argument(
"--worker-name",
type=str,
required=False,
default="llama",
help="Worker name",
)
parser.add_argument(
"--max-model-len",
type=int,
required=False,
default=None,
help="Maximum model sequence length (input plus output tokens).",
)
parser.add_argument(
"--max-batch-size",
type=int,
required=False,
default=10000,
help="Max batch size",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
required=False,
default=0.45,
help="GPU memory utilization (fraction of memory from 0.0 to 1.0)",
)
parser.add_argument(
"--dtype",
type=str,
required=False,
default="float16",
help="Model data type (float16, TODO: fp8)",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
required=False,
default="auto",
help="Key-value cache data type",
)
# FIXME: Support string values like 'debug', 'info', etc.
parser.add_argument(
"--log-level",
type=int,
required=False,
choices=[0, 1, 2],
default=1,
help="Logging level: 2=debug, 1=info, 0=error (default=1)",
)
## Boolean flags forwarded to the vLLM engine
parser.add_argument(
"--enable-prefix-caching",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enable prefix caching",
)
parser.add_argument(
"--enable-chunked-prefill",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enable chunked prefill",
)
parser.add_argument(
"--enforce-eager",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enforce eager execution",
)
parser.add_argument(
"--ignore-eos",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Ignore EOS token when generating",
)
parser.add_argument(
"--baseline-tp-size",
type=int,
default=1,
help="Tensor parallel size of a baseline worker.",
)
parser.add_argument(
"--context-tp-size",
type=int,
default=1,
help="Tensor parallel size of a context worker.",
)
parser.add_argument(
"--generate-tp-size",
type=int,
default=1,
help="Tensor parallel size of a generate worker.",
)
parser.add_argument(
"--max-num-seqs",
type=int,
default=None,
help="Maximum number of sequences per iteration",
)
parser.add_argument(
"--disable-async-output-proc",
action="store_true",
help="Disable async output processing",
)
parser.add_argument(
"--disable-log-stats",
action="store_true",
help="Disable logging statistics",
)
return parser.parse_args()
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import inspect
import os
import time
from typing import Any, AsyncGenerator, Dict, Optional
import numpy as np
import vllm.engine.arg_utils
import vllm.engine.async_llm_engine
import vllm.inputs.data
import vllm.logger
LOGGER = vllm.logger.init_logger(__name__)
# TODO ptarasiewicz: remove after verifying that streaming works efficiently.
# FIXME: streaming every token is currently not efficient. With a large
# RETURN_EVERY_N we return only the first token and the whole sequence at
# the end; the current value of 1 streams every token.
RETURN_EVERY_N = 1
class Stage(abc.ABC):
@abc.abstractmethod
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
yield {}
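# A minimal sketch of a concrete Stage (hypothetical, for illustration only):
# each stage is an async generator that consumes a payload dict and yields
# response dicts carrying "outputs", "error", "final", and any stage
# "parameters", mirroring the stages defined below.
class EchoStage(Stage):
    async def __call__(
        self, input_payload: Dict[str, Any]
    ) -> AsyncGenerator[Dict[str, Any], None]:
        # Echo the prompt back as the generated text in a single response.
        prompt = input_payload.get("parameters", {}).get("prompt", "")
        yield {
            "outputs": {},
            "error": None,
            "final": True,
            "parameters": {"text": str(prompt)},
        }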
class AggregatedStage(Stage):
def __init__(
self,
**kwargs,
):
self._ignore_eos = kwargs.pop("ignore_eos", False)
engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**kwargs)
LOGGER.info(f"Creating engine with args: {engine_args}")
self._engine = vllm.engine.async_llm_engine.AsyncLLMEngine.from_engine_args(
engine_args
)
LOGGER.info(f"Created engine: {self._engine}")
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
try:
vllm_input = input_payload["parameters"]["prompt"]
sampling_params = vllm.SamplingParams(
**input_payload["parameters"].get("sampling_params", {}),
ignore_eos=self._ignore_eos,
)
LOGGER.debug(f"sampling_params: {sampling_params}")
request_id = input_payload["parameters"].get("request_id", None)
results_generator = self._engine.generate(
vllm_input, sampling_params, request_id
)
LOGGER.debug("results_generator started")
counter = 0
async for result in results_generator:
if counter % RETURN_EVERY_N == 0 or result.finished:
tokens_ids = np.stack(
[output_row.token_ids for output_row in result.outputs]
).astype(np.int64)
LOGGER.debug(f"tokens_ids: {tokens_ids.shape}")
yield {
"outputs": {},
"error": None,
"final": result.finished,
"parameters": {
"text": result.outputs[0].text,
},
}
counter += 1
LOGGER.debug("results_generator finished")
except Exception as e:
LOGGER.error(f"Exception in AggregatedStage: {e}")
yield {"outputs": {}, "error": str(e), "final": True}
class PrefillStage(Stage):
def __init__(
self,
generate_tensor_parallel_size: Optional[int] = None,
**kwargs,
):
context_tensor_parallel_size = kwargs.get("tensor_parallel_size", 1)
generate_tensor_parallel_size = (
generate_tensor_parallel_size or context_tensor_parallel_size
)
assert (
generate_tensor_parallel_size % context_tensor_parallel_size == 0
), "generate_tensor_parallel_size must be multiple of context_tensor_parallel_size"
LOGGER.debug(f"context_tensor_parallel_size: {context_tensor_parallel_size}")
LOGGER.debug(f"generate_tensor_parallel_size: {generate_tensor_parallel_size}")
os.environ["VLLM_DISAGG_STAGE"] = "PREFILL"
os.environ["VLLM_CONTEXT_TP_SIZE"] = str(context_tensor_parallel_size)
os.environ["VLLM_GENERATE_TP_SIZE"] = str(generate_tensor_parallel_size)
LOGGER.info(f"Env VLLM_DISAGG_STAGE set to {os.environ['VLLM_DISAGG_STAGE']}")
# Prefill must run eagerly because of the variable input sequence length (ISL).
kwargs["enforce_eager"] = True
self._ignore_eos = kwargs.pop("ignore_eos", False)
engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**kwargs)
LOGGER.info(f"Creating engine with args: {engine_args}")
self._engine = vllm.engine.async_llm_engine.AsyncLLMEngine.from_engine_args(
engine_args
)
LOGGER.info("Prefill stage initialized")
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
try:
vllm_input = input_payload["parameters"]["prompt"]
request_id = input_payload["parameters"].get("request_id", None)
assert request_id is not None, "request_id is required for prefill"
sampling_params = vllm.SamplingParams(
**input_payload["parameters"].get("sampling_params", {}),
ignore_eos=self._ignore_eos,
)
old_my_max_tokens = sampling_params.max_tokens
old_my_min_tokens = sampling_params.min_tokens
sampling_params.max_tokens = 1
sampling_params.min_tokens = 1
LOGGER.debug(f"sampling_params: {sampling_params}")
start_time_ns = time.monotonic_ns()
results_generator = self._engine.generate(
vllm_input, sampling_params, request_id
)
LOGGER.debug("results_generator started")
async for result in results_generator:
taken_ms = (time.monotonic_ns() - start_time_ns) / 1_000_000
LOGGER.info(
"==== Prefill completed kv cache taken %0.3fms ====", taken_ms
)
# TODO: prompt, request_id, and sampling_params need to be passed to the
# next stage, as there is no pipeline concept in the online scenario.
sampling_params.max_tokens = old_my_max_tokens
sampling_params.min_tokens = old_my_min_tokens
sampling_params_init_names = inspect.signature(
vllm.SamplingParams
).parameters.keys()
sampling_params = {
k: v
for k, v in sampling_params.__dict__.items()
if k in sampling_params_init_names
}
LOGGER.debug(
f"Yield response {input_payload['inputs'].keys()} parameters {input_payload['parameters']}"
)
yield {
"outputs": {},
"error": None,
"parameters": {
"context_worker_id": os.environ["VLLM_WORKER_ID"],
"first_token": result.outputs[0].token_ids[0],
"seq_len": len(result.prompt_token_ids),
},
"final": True,
}
LOGGER.debug("Results generator for prefill finishes")
except Exception as e:
LOGGER.error(f"Exception in PrefillStage: {e}")
yield {"outputs": {}, "error": str(e), "final": True}
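# Handoff sketch (inferred from the prefill yield above and GenerateStage
# below): the "parameters" passed along to the generate worker must carry at
# least request_id, sampling_params, context_worker_id, first_token, and
# seq_len so it can allocate kv-cache space of the right shape and pull the
# cache from the correct context worker, e.g.:
#
#     {"parameters": {"request_id": "req-0",
#                     "sampling_params": {"max_tokens": 25},
#                     "context_worker_id": "0",
#                     "first_token": 1234,
#                     "seq_len": 128}}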
class GenerateStage(Stage):
def __init__(
self,
**kwargs,
):
os.environ["VLLM_DISAGG_STAGE"] = "GENERATE"
LOGGER.info(f"Env VLLM_DISAGG_STAGE set to {os.environ['VLLM_DISAGG_STAGE']}")
self._ignore_eos = kwargs.pop("ignore_eos", False)
engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**kwargs)
LOGGER.info(f"Creating engine with args: {engine_args}")
self._engine = vllm.engine.async_llm_engine.AsyncLLMEngine.from_engine_args(
engine_args
)
LOGGER.info("Generation stage initialized")
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
seq_len = input_payload["parameters"]["seq_len"]
LOGGER.debug(f"input sequence length: {seq_len}")
# We can use any token ids here: the first token was already sampled by the
# context worker, and we only need the correct shape to allocate kv-cache space.
vllm_input = vllm.inputs.data.TokensPrompt(prompt_token_ids=[0] * seq_len)
sampling_params = vllm.SamplingParams(
**input_payload["parameters"].get("sampling_params", {}),
ignore_eos=self._ignore_eos,
)
LOGGER.debug(f"sampling_params: {sampling_params}")
request_id = input_payload["parameters"].get("request_id", None)
assert request_id is not None, "request_id is required for generate"
context_worker_id = input_payload["parameters"]["context_worker_id"]
new_request_id = f"{request_id}___{context_worker_id}"
first_token = input_payload["parameters"]["first_token"]
self._engine.engine.model_executor.driver_worker.model_runner.set_first_token(
new_request_id, first_token
)
# TODO ptarasiewicz: this is only a temporary way to pass the worker id to
# the engine so that it can pull the correct kv cache.
results_generator = self._engine.generate(
vllm_input,
sampling_params,
new_request_id,
)
LOGGER.debug("results_generator started")
counter = 0
async for result in results_generator:
if counter % RETURN_EVERY_N == 0 or result.finished:
yield {
"outputs": {},
"error": None,
"final": result.finished,
"parameters": {
"text": result.outputs[0].text,
},
}
counter += 1
LOGGER.debug("results_generator finished for generate")