Unverified commit 6e2da515 authored by Lifu Huang, committed by GitHub

Replace time.time() with time.perf_counter() for benchmarking. (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
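
The diff below applies the same mechanical substitution throughout the benchmark scripts: wall-clock time.time() is swapped for time.perf_counter(), which is monotonic and has the highest available resolution, so elapsed-time measurements are not affected by system clock adjustments. A minimal sketch of the before/after timing pattern follows; the do_work placeholder stands in for the benchmarked call (e.g. run_batch) and is illustrative only, not part of this commit.

import time


def do_work():
    # Stand-in for the benchmarked call, e.g. run_batch(...) in the scripts below.
    sum(i * i for i in range(1_000_000))


# Before: time.time() tracks the system clock, which can jump (NTP sync,
# manual changes), so short intervals measured with it may be skewed.
tic = time.time()
do_work()
latency_wall = time.time() - tic

# After: time.perf_counter() is monotonic and high-resolution, making it
# the appropriate clock for measuring elapsed time in benchmarks.
tic = time.perf_counter()
do_work()
latency_perf = time.perf_counter() - tic

print(f"Latency (time.time): {latency_wall:.3f}")
print(f"Latency (perf_counter): {latency_perf:.3f}")
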
@@ -59,7 +59,7 @@ def main(args):
     backend = select_sglang_backend(args)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = multi_dimension_judge.run_batch(
         arguments,
         temperature=0,
@@ -67,7 +67,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"Latency: {latency:.3f}")
...
@@ -45,7 +45,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = json_decode(generate=call_generate, **arguments[i])
 
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -58,7 +58,7 @@ def main(args):
                 )
             )
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
...
@@ -46,11 +46,11 @@ def main(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = json_decode.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
...
@@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
             pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
             preds[i] = pred.strip()[0]
 
-        tic = time.time()
+        tic = time.perf_counter()
         if args.parallel == 1:
             for i in range(len(prompts)):
                 get_one_answer(i)
@@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
                 for j in range(len(rets)):
                     preds[i + j] = rets[j].strip()[0]
 
-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     cors = [pred == label for pred, label in zip(preds, labels)]
...
@@ -116,7 +116,7 @@ def main(args):
     backend = select_sglang_backend(args)
 
     # Run
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_mmlu.run_batch(
         arguments,
         temperature=0,
@@ -128,7 +128,7 @@ def main(args):
     preds = [
         s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
     ]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     cors = [pred == label for pred, label in zip(preds, labels)]
...
@@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None:
         api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
     )
     semaphore = asyncio.Semaphore(args.concurrency)
-    start = time.time()
+    start = time.perf_counter()
     base_url = f"http://127.0.0.1:{args.port}"
 
     if args.profile:
@@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None:
         if profile_output.success:
             print("Profiler stopped")
 
-    print(f"Benchmark time: {time.time() - start}")
+    print(f"Benchmark time: {time.perf_counter() - start}")
     args.output_path = f"./val_sglang.json"
     save_json(args.output_path, out_samples)
     eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
...
@@ -66,7 +66,7 @@ def main(args):
         answers[i] = cur_answers
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_answer(i)
@@ -79,7 +79,7 @@ def main(args):
                 )
             )
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
...
@@ -57,7 +57,7 @@ def main(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = answer_mt_bench.run_batch(
         arguments,
         temperature=0,
@@ -66,7 +66,7 @@ def main(args):
         progress_bar=True,
     )
     answers = [[s["answer_1"], s["answer_2"]] for s in rets]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
...
@@ -68,7 +68,7 @@ def main(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = answer_mt_bench.run_batch(
         arguments,
         temperature=0,
@@ -78,7 +78,7 @@ def main(args):
     )
     answers = [[s["answer_1"], s["answer_2"]] for s in rets]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     num_output_tokens = sum(
         s.get_meta_info("answer_1")["completion_tokens"]
         + s.get_meta_info("answer_2")["completion_tokens"]
...
@@ -113,7 +113,7 @@ def main(args):
             answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
             states[i] = answer
 
-        tic = time.time()
+        tic = time.perf_counter()
         if args.parallel == 1:
             for i in tqdm(range(len(questions))):
                 get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
             )
             states[i] = answer
 
-        tic = time.time()
+        tic = time.perf_counter()
         loop = asyncio.get_event_loop()
         batches = [
             list(range(i, min(i + args.parallel, len(questions))))
@@ -144,7 +144,7 @@ def main(args):
             tasks = [get_one_answer_asyncio(k) for k in bt]
             loop.run_until_complete(asyncio.gather(*tasks))
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     preds = []
     for i in range(len(states)):
...
@@ -90,7 +90,7 @@ def main(args):
     backend = select_sglang_backend(args)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = multi_chain_gsm8k.run_batch(
         arguments,
         temperature=0,
@@ -98,7 +98,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     preds = []
     for i in range(len(states)):
...
@@ -61,7 +61,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = multi_document_qa(generate=call_generate, **arguments[i])
 
-    tic = time.time()
+    tic = time.perf_counter()
    if args.parallel == 1:
        for i in tqdm(range(len(labels))):
            get_one_answer(i)
@@ -74,7 +74,7 @@ def main(args):
                )
            )
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(states)
...
@@ -49,11 +49,11 @@ def main(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = multi_document_qa.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print([s["answer"] for s in states])
...
@@ -35,7 +35,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = multi_turns(generate=call_generate, **multi_qas[i])
 
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(multi_qas))):
             get_one_answer(i)
@@ -50,7 +50,7 @@ def main(args):
             for _ in rets:
                 pass
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
...
@@ -27,7 +27,7 @@ def main(args):
     backend = select_sglang_backend(args)
 
-    tic = time.time()
+    tic = time.perf_counter()
     states = multi_turns.run_batch(
         multi_qas,
         temperature=0,
@@ -35,7 +35,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"Latency: {latency:.3f}")
...
@@ -84,7 +84,7 @@ def main(args):
     backend = select_sglang_backend(args)
 
-    tic = time.time()
+    tic = time.perf_counter()
     states = multi_turns.run_batch(
         multi_qas,
         temperature=0,
@@ -92,7 +92,7 @@ def main(args):
         num_threads="auto",
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"Latency: {latency:.3f}")
...
@@ -146,7 +146,7 @@ def main(args):
         states.append(answer)
 
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
         if args.parallel == 1:
@@ -173,7 +173,7 @@ def main(args):
             tasks = [run_single_agent_async(arg) for arg in bt]
             loop.run_until_complete(asyncio.gather(*tasks))
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"Latency: {latency:.3f}")
...
@@ -115,14 +115,14 @@ def main(args):
     sgl.set_default_backend(backend)
 
     states = []
-    tic = time.time()
+    tic = time.perf_counter()
     states = webthink.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
...
@@ -51,7 +51,7 @@ def main(args):
     )
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = reasoning_gen.run_batch(
         questions,
         num_threads=args.parallel,
@@ -60,7 +60,7 @@ def main(args):
         max_new_tokens=32768,
         top_p=0.95,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Extract results and record outcomes in a list.
     outcomes = []
...
@@ -68,7 +68,7 @@ def main(args):
     call_generate = partial(get_call_generate(args), temperature=0)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
 
         def get_one_answer(i):
@@ -102,7 +102,7 @@ def main(args):
             loop.run_until_complete(
                 asyncio.gather(*[get_one_answer_async(i) for i in batch])
             )
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
...