Unverified commit c9d838fc, authored by RonaldBXu and committed by GitHub
Browse files

Adding deterministic lora benchmarking to vLLM Bench (#36057)


Signed-off-by: Ubuntu <ubuntu@ip-172-31-43-201.ap-northeast-1.compute.internal>
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
parent b1169d7b
...@@ -183,6 +183,68 @@ class BenchmarkDataset(ABC): ...@@ -183,6 +183,68 @@ class BenchmarkDataset(ABC):
) )
return lora_request return lora_request
def get_round_robin_lora_request(
    self,
    index: int,
    max_loras: int | None = None,
    lora_path: str | None = None,
) -> LoRARequest | None:
    """
    Optionally select a LoRA request using deterministic round-robin.

    The LoRA ID is derived solely from the request index, so the same
    index always maps to the same ID and consecutive indices cycle
    through the IDs `1..max_loras` in order.

    Args:
        index (int): The request index used for round-robin selection.
        max_loras (Optional[int]): The maximum number of LoRAs available.
            If `None`, LoRA is not used.
        lora_path (Optional[str]): Path to the LoRA parameters on disk.
            If `None`, LoRA is not used.

    Returns:
        A new [`LoRARequest`][vllm.lora.request.LoRARequest]
        (or `None` if not applicable).

    Raises:
        ValueError: If LoRA is enabled (both `max_loras` and `lora_path`
            are given) and `max_loras` is not a positive integer.
    """
    if max_loras is None or lora_path is None:
        return None

    # A zero modulus would surface as an opaque ZeroDivisionError and a
    # negative one would yield IDs outside [1, max_loras]; reject both up
    # front with an actionable message.
    if max_loras < 1:
        raise ValueError(
            f"max_loras must be a positive integer, got {max_loras}."
        )

    # Deterministic round-robin: cycle through [1, max_loras]. Python's
    # modulo is non-negative for a positive modulus, so the range also
    # holds for negative indices.
    lora_id = index % max_loras + 1
    return LoRARequest(
        lora_name=str(lora_id),
        lora_int_id=lora_id,
        lora_path=lora_path_on_disk(lora_path),
    )
def get_lora_request(
    self,
    index: int,
    max_loras: int | None = None,
    lora_path: str | None = None,
    lora_assignment: str = "random",
) -> LoRARequest | None:
    """
    Pick a LoRA request according to the requested assignment strategy.

    Dispatches to `get_round_robin_lora_request` when `lora_assignment`
    is exactly 'round-robin'; every other value is handled by
    `get_random_lora_request`.

    Args:
        index (int): The request index (only consulted for round-robin).
        max_loras (Optional[int]): The maximum number of LoRAs available.
        lora_path (Optional[str]): Path to the LoRA parameters on disk.
        lora_assignment (str): Strategy for LoRA selection.
            'random' (default) or 'round-robin'.

    Returns:
        A new [`LoRARequest`][vllm.lora.request.LoRARequest]
        (or `None` if not applicable).
    """
    wants_round_robin = lora_assignment == "round-robin"
    if not wants_round_robin:
        # Any strategy other than 'round-robin' uses random selection.
        return self.get_random_lora_request(
            max_loras=max_loras,
            lora_path=lora_path,
        )

    return self.get_round_robin_lora_request(
        index=index,
        max_loras=max_loras,
        lora_path=lora_path,
    )
@abstractmethod @abstractmethod
def sample( def sample(
self, self,
...@@ -478,6 +540,9 @@ class RandomDataset(BenchmarkDataset): ...@@ -478,6 +540,9 @@ class RandomDataset(BenchmarkDataset):
input_len: int = DEFAULT_INPUT_LEN, input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN,
batchsize: int = 1, batchsize: int = 1,
max_loras: int | None = None,
lora_path: str | None = None,
lora_assignment: str = "random",
**kwargs, **kwargs,
) -> list[SampleRequest]: ) -> list[SampleRequest]:
# validate total input tokens (prefix + sampled) is at least 1. # validate total input tokens (prefix + sampled) is at least 1.
...@@ -522,11 +587,18 @@ class RandomDataset(BenchmarkDataset): ...@@ -522,11 +587,18 @@ class RandomDataset(BenchmarkDataset):
allowed_tokens=allowed_tokens, allowed_tokens=allowed_tokens,
) )
token_mismatch_total += token_mismatch token_mismatch_total += token_mismatch
lora_req = self.get_lora_request(
index=i,
max_loras=max_loras,
lora_path=lora_path,
lora_assignment=lora_assignment,
)
requests.append( requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=total_input_len, prompt_len=total_input_len,
expected_output_len=int(output_lens[i]), expected_output_len=int(output_lens[i]),
lora_request=lora_req,
request_id=request_id_prefix + str(i), request_id=request_id_prefix + str(i),
) )
) )
...@@ -1263,6 +1335,7 @@ class ShareGPTDataset(BenchmarkDataset): ...@@ -1263,6 +1335,7 @@ class ShareGPTDataset(BenchmarkDataset):
enable_multimodal_chat: bool = False, enable_multimodal_chat: bool = False,
request_id_prefix: str = "", request_id_prefix: str = "",
no_oversample: bool = False, no_oversample: bool = False,
lora_assignment: str = "random",
**kwargs, **kwargs,
) -> list: ) -> list:
samples: list = [] samples: list = []
...@@ -1275,8 +1348,11 @@ class ShareGPTDataset(BenchmarkDataset): ...@@ -1275,8 +1348,11 @@ class ShareGPTDataset(BenchmarkDataset):
entry["conversations"][1]["value"], entry["conversations"][1]["value"],
) )
lora_request = self.get_random_lora_request( lora_request = self.get_lora_request(
max_loras=max_loras, lora_path=lora_path index=ind,
max_loras=max_loras,
lora_path=lora_path,
lora_assignment=lora_assignment,
) )
prompt_ids = tokenizer(prompt).input_ids prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids completion_ids = tokenizer(completion).input_ids
...@@ -2413,6 +2489,7 @@ class BurstGPTDataset(BenchmarkDataset): ...@@ -2413,6 +2489,7 @@ class BurstGPTDataset(BenchmarkDataset):
lora_path: str | None = None, lora_path: str | None = None,
request_id_prefix: str = "", request_id_prefix: str = "",
no_oversample: bool = False, no_oversample: bool = False,
lora_assignment: str = "random",
**kwargs, **kwargs,
) -> list[SampleRequest]: ) -> list[SampleRequest]:
samples = [] samples = []
...@@ -2420,8 +2497,11 @@ class BurstGPTDataset(BenchmarkDataset): ...@@ -2420,8 +2497,11 @@ class BurstGPTDataset(BenchmarkDataset):
for i in range(num_requests): for i in range(num_requests):
input_len = int(data[i][2]) input_len = int(data[i][2])
output_len = int(data[i][3]) output_len = int(data[i][3])
lora_req = self.get_random_lora_request( lora_req = self.get_lora_request(
max_loras=max_loras, lora_path=lora_path index=i,
max_loras=max_loras,
lora_path=lora_path,
lora_assignment=lora_assignment,
) )
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
# Generate a synthetic prompt: a list of token IDs computed as (i + # Generate a synthetic prompt: a list of token IDs computed as (i +
......
...@@ -624,6 +624,7 @@ async def benchmark( ...@@ -624,6 +624,7 @@ async def benchmark(
lora_modules: Iterable[str] | None, lora_modules: Iterable[str] | None,
extra_headers: dict | None, extra_headers: dict | None,
extra_body: dict | None, extra_body: dict | None,
lora_assignment: Literal["random", "round-robin"] = "random",
ramp_up_strategy: Literal["linear", "exponential"] | None = None, ramp_up_strategy: Literal["linear", "exponential"] | None = None,
ramp_up_start_rps: int | None = None, ramp_up_start_rps: int | None = None,
ramp_up_end_rps: int | None = None, ramp_up_end_rps: int | None = None,
...@@ -731,9 +732,19 @@ async def benchmark( ...@@ -731,9 +732,19 @@ async def benchmark(
print("Starting main benchmark run...") print("Starting main benchmark run...")
if lora_modules: if lora_modules:
lora_modules_list = list(lora_modules)
if lora_assignment == "round-robin":
# Deterministic round-robin assignment across requests.
lora_modules = iter(
[
lora_modules_list[i % len(lora_modules_list)]
for i in range(len(input_requests))
]
)
else:
# For each input request, choose a LoRA module at random. # For each input request, choose a LoRA module at random.
lora_modules = iter( lora_modules = iter(
[random.choice(lora_modules) for _ in range(len(input_requests))] [random.choice(lora_modules_list) for _ in range(len(input_requests))]
) )
if profile: if profile:
...@@ -1523,7 +1534,18 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -1523,7 +1534,18 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=None, default=None,
help="A subset of LoRA module names passed in when " help="A subset of LoRA module names passed in when "
"launching the server. For each request, the " "launching the server. For each request, the "
"script chooses a LoRA module at random.", "script chooses a LoRA module at random by default. "
"Use --lora-assignment to control selection strategy.",
)
parser.add_argument(
"--lora-assignment",
type=str,
default="random",
choices=["random", "round-robin"],
help="Strategy for assigning LoRA modules to requests. "
"'random' (default) selects a LoRA at random for each request. "
"'round-robin' cycles through LoRA modules deterministically.",
) )
parser.add_argument( parser.add_argument(
...@@ -1788,6 +1810,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: ...@@ -1788,6 +1810,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
goodput_config_dict=goodput_config_dict, goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules, lora_modules=args.lora_modules,
lora_assignment=args.lora_assignment,
extra_headers=headers, extra_headers=headers,
extra_body=extra_body, extra_body=extra_body,
ramp_up_strategy=args.ramp_up_strategy, ramp_up_strategy=args.ramp_up_strategy,
......
...@@ -350,6 +350,7 @@ def get_requests(args, tokenizer): ...@@ -350,6 +350,7 @@ def get_requests(args, tokenizer):
"tokenizer": tokenizer, "tokenizer": tokenizer,
"lora_path": args.lora_path, "lora_path": args.lora_path,
"max_loras": args.max_loras, "max_loras": args.max_loras,
"lora_assignment": getattr(args, "lora_assignment", "random"),
"num_requests": args.num_prompts, "num_requests": args.num_prompts,
} }
...@@ -778,6 +779,15 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -778,6 +779,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Path to the lora adapters to use. This can be an absolute path, " help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.", "a relative path, or a Hugging Face model identifier.",
) )
parser.add_argument(
"--lora-assignment",
type=str,
default="random",
choices=["random", "round-robin"],
help="Strategy for assigning LoRA adapters to requests. "
"'random' (default) selects a LoRA at random for each request. "
"'round-robin' cycles through LoRAs deterministically.",
)
parser.add_argument( parser.add_argument(
"--prefix-len", "--prefix-len",
type=int, type=int,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment