Unverified commit 7222e1da, authored by fzyzcjy, committed by GitHub

Let bench_one_batch_server use sharegpt data to make expert distribution more natural (#5573)

parent 505eec4d
@@ -22,6 +22,7 @@ from typing import Tuple
 
 import numpy as np
 import requests
+from sglang.bench_serving import get_tokenizer, sample_random_requests
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
@@ -117,16 +118,19 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
+    tokenizer,
 ):
     requests.post(url + "/flush_cache")
-    input_lens = [
-        int(input_len * (1 + (i - (batch_size - 1) / 2) * input_len_step_percentage))
-        for i in range(batch_size)
-    ]
-    input_ids = [
-        [int(x) for x in np.random.randint(0, high=16384, size=(input_lens[i],))]
-        for i in range(batch_size)
-    ]
+    input_requests = sample_random_requests(
+        input_len=input_len,
+        output_len=output_len,
+        num_prompts=batch_size,
+        range_ratio=1.0,
+        tokenizer=tokenizer,
+        dataset_path="",
+        random_sample=True,
+        return_text=False,
+    )
 
     use_structured_outputs = False
     if use_structured_outputs:
@@ -145,8 +149,7 @@ def run_one_case(
     response = requests.post(
         url + "/generate",
         json={
-            # "text": texts,
-            "input_ids": input_ids,
+            "input_ids": [input_ids for input_ids, _, _ in input_requests],
             "sampling_params": {
                 "temperature": temperature,
                 "max_new_tokens": output_len,
@@ -228,6 +231,9 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     else:
         proc, base_url = launch_server_process(server_args)
 
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
+
     # warmup
     if not bench_args.skip_warmup:
         print("=" * 8 + " Warmup Begin " + "=" * 8)
@@ -241,6 +247,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             input_len_step_percentage=bench_args.input_len_step_percentage,
             run_name="",
             result_filename="",
+            tokenizer=tokenizer,
         )
         print("=" * 8 + " Warmup End " + "=" * 8 + "\n")
 
......
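Taken together, the bench_one_batch_server changes above replace uniform-random token IDs with prompts drawn through sample_random_requests, so the token mix (and hence the MoE expert routing) during benchmarking looks closer to real traffic. The sketch below mirrors the new flow; it is a minimal illustration, assuming that each sampled row unpacks into (input_ids, prompt_len, output_len) as in the diff, that an empty dataset_path falls back to ShareGPT data (as the commit title suggests), and that the model path and lengths are placeholders only.

from sglang.bench_serving import get_tokenizer, sample_random_requests

# Illustrative tokenizer; any HF model/tokenizer path would do here.
tokenizer = get_tokenizer("Qwen/Qwen2.5-7B-Instruct")

# return_text=False keeps every prompt as a list of token ids, which is
# what run_one_case now forwards to the /generate endpoint as "input_ids".
input_requests = sample_random_requests(
    input_len=1024,
    output_len=16,
    num_prompts=8,        # plays the role of batch_size in run_one_case
    range_ratio=1.0,      # fixed input length per request
    tokenizer=tokenizer,
    dataset_path="",      # assumed to fall back to ShareGPT data, per the commit title
    random_sample=True,
    return_text=False,
)

# Same unpacking as in the diff: each row yields (input_ids, prompt_len, output_len).
payload = {
    "input_ids": [input_ids for input_ids, _, _ in input_requests],
    "sampling_params": {"temperature": 0.0, "max_new_tokens": 16},
}

POSTing this payload to the server's /generate endpoint reproduces, per batch, what the benchmark now does.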
@@ -471,6 +471,10 @@ def get_model(pretrained_model_name_or_path: str) -> str:
 def get_tokenizer(
     pretrained_model_name_or_path: str,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
@@ -832,6 +836,7 @@ def sample_random_requests(
     tokenizer: PreTrainedTokenizerBase,
     dataset_path: str,
     random_sample: bool = True,
+    return_text: bool = True,
 ) -> List[DatasetRow]:
     input_lens = np.random.randint(
         max(int(input_len * range_ratio), 1),
@@ -892,10 +897,12 @@ def sample_random_requests(
         else:
             ratio = (input_lens[i] + prompt_len - 1) // prompt_len
             input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
-            prompt = tokenizer.decode(input_ids)
+            input_content = input_ids
+            if return_text:
+                input_content = tokenizer.decode(input_content)
             input_requests.append(
                 DatasetRow(
-                    prompt=prompt,
+                    prompt=input_content,
                     prompt_len=int(input_lens[i]),
                     output_len=int(output_lens[i]),
                 )
@@ -905,15 +912,15 @@ def sample_random_requests(
         offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
         input_requests = []
         for i in range(num_prompts):
-            prompt = tokenizer.decode(
-                [
-                    (offsets[i] + i + j) % tokenizer.vocab_size
-                    for j in range(input_lens[i])
-                ]
-            )
+            input_content = [
+                (offsets[i] + i + j) % tokenizer.vocab_size
+                for j in range(input_lens[i])
+            ]
+            if return_text:
+                input_content = tokenizer.decode(input_content)
             input_requests.append(
                 DatasetRow(
-                    prompt=prompt,
+                    prompt=input_content,
                     prompt_len=int(input_lens[i]),
                     output_len=int(output_lens[i]),
                 )
......
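The new return_text flag in sample_random_requests only changes what ends up in DatasetRow.prompt: decoded text when True (the previous behaviour, kept as the default) versus the raw token-id list when False. A minimal sketch of the two modes, again with an illustrative model path and assuming the ShareGPT fallback for an empty dataset_path:

from sglang.bench_serving import get_tokenizer, sample_random_requests

tokenizer = get_tokenizer("Qwen/Qwen2.5-7B-Instruct")  # illustrative model path

common = dict(
    input_len=256,
    output_len=8,
    num_prompts=2,
    range_ratio=1.0,
    tokenizer=tokenizer,
    dataset_path="",      # assumed to fall back to ShareGPT data
    random_sample=True,
)

text_rows = sample_random_requests(return_text=True, **common)   # default: decoded text
ids_rows = sample_random_requests(return_text=False, **common)   # new token-id mode

# prompt is a decoded string in the first case and a list of ints in the second.
assert isinstance(text_rows[0].prompt, str)
assert isinstance(ids_rows[0].prompt, list)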