Unverified Commit 58abe354 authored by Jeremy Arnold's avatar Jeremy Arnold Committed by GitHub
Browse files

[Benchmarks] Make detokenization optional in benchmark scripts (#11697)


Signed-off-by: default avatarJeremy Arnold <Jeremy.Arnold@amd.com>
parent f7ebad23
......@@ -52,6 +52,7 @@ def main(args: argparse.Namespace):
top_p=1.0,
ignore_eos=True,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(10000,
......@@ -173,6 +174,12 @@ if __name__ == "__main__":
default=None,
help="Path to save the latency results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
......
......@@ -194,7 +194,9 @@ def main(args):
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
sampling_params = SamplingParams(temperature=0,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize)
print("Testing filtered requests")
prompts = repeat_and_sort_requests(filtered_requests,
......@@ -243,6 +245,12 @@ if __name__ == "__main__":
"subtract this length when filtering prompts. Only used "
"when dataset-path is not provided.",
)
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
......
......@@ -23,7 +23,7 @@ def sample_requests(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> list[tuple[str, int, int]]:
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
......@@ -71,6 +71,7 @@ def run_vllm(
requests: list[tuple[str, int, int]],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
......@@ -95,6 +96,7 @@ def run_vllm(
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
detokenize=not disable_detokenize,
))
start = time.perf_counter()
......@@ -121,7 +123,8 @@ def main(args: argparse.Namespace):
if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
EngineArgs.from_cli_args(args),
args.disable_detokenize)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
......@@ -174,6 +177,12 @@ if __name__ == "__main__":
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
......
......@@ -168,6 +168,7 @@ def run_vllm(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
......@@ -194,6 +195,7 @@ def run_vllm(
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
......@@ -232,6 +234,7 @@ async def run_vllm_async(
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
from vllm import SamplingParams
......@@ -262,6 +265,7 @@ async def run_vllm_async(
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
lora_requests.append(request.lora_request)
......@@ -288,6 +292,7 @@ def run_hf(
n: int,
max_batch_size: int,
trust_remote_code: bool,
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
......@@ -327,6 +332,7 @@ def run_hf(
use_cache=True,
max_new_tokens=max_output_len,
)
if not disable_detokenize:
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
......@@ -440,14 +446,17 @@ def main(args: argparse.Namespace):
args.n,
AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing,
args.disable_detokenize,
))
else:
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
EngineArgs.from_cli_args(args),
args.disable_detokenize)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.hf_max_batch_size, args.trust_remote_code)
args.hf_max_batch_size, args.trust_remote_code,
args.disable_detokenize)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
......@@ -526,6 +535,11 @@ if __name__ == "__main__":
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"))
# LoRA
parser.add_argument(
"--lora-path",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment