Unverified Commit 6e2da515 authored by Lifu Huang's avatar Lifu Huang Committed by GitHub
Browse files

Replace time.time() to time.perf_counter() for benchmarking. (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
......@@ -59,7 +59,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = multi_dimension_judge.run_batch(
arguments,
temperature=0,
......@@ -67,7 +67,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
......
......@@ -45,7 +45,7 @@ def main(args):
def get_one_answer(i):
states[i] = json_decode(generate=call_generate, **arguments[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
......@@ -58,7 +58,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
......@@ -46,11 +46,11 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
......@@ -76,7 +76,7 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
preds[i] = pred.strip()[0]
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in range(len(prompts)):
get_one_answer(i)
......@@ -94,9 +94,9 @@ def evaluate(args, subject, dev_df, test_df, call_generate):
for j in range(len(rets)):
preds[i + j] = rets[j].strip()[0]
tic = time.time()
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
cors = [pred == label for pred, label in zip(preds, labels)]
......
......@@ -116,7 +116,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run
tic = time.time()
tic = time.perf_counter()
states = few_shot_mmlu.run_batch(
arguments,
temperature=0,
......@@ -128,7 +128,7 @@ def main(args):
preds = [
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
]
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
cors = [pred == label for pred, label in zip(preds, labels)]
......
......@@ -119,7 +119,7 @@ async def eval_mmmu(args) -> None:
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
)
semaphore = asyncio.Semaphore(args.concurrency)
start = time.time()
start = time.perf_counter()
base_url = f"http://127.0.0.1:{args.port}"
if args.profile:
......@@ -147,7 +147,7 @@ async def eval_mmmu(args) -> None:
if profile_output.success:
print("Profiler stopped")
print(f"Benchmark time: {time.time() - start}")
print(f"Benchmark time: {time.perf_counter() - start}")
args.output_path = f"./val_sglang.json"
save_json(args.output_path, out_samples)
eval_result(model_answer_path=args.output_path, answer_dict=answer_dict)
......
......@@ -66,7 +66,7 @@ def main(args):
answers[i] = cur_answers
# Run requests
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_answer(i)
......@@ -79,7 +79,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
......
......@@ -57,7 +57,7 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
rets = answer_mt_bench.run_batch(
arguments,
temperature=0,
......@@ -66,7 +66,7 @@ def main(args):
progress_bar=True,
)
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
......
......@@ -68,7 +68,7 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
rets = answer_mt_bench.run_batch(
arguments,
temperature=0,
......@@ -78,7 +78,7 @@ def main(args):
)
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
latency = time.time() - tic
latency = time.perf_counter() - tic
num_output_tokens = sum(
s.get_meta_info("answer_1")["completion_tokens"]
+ s.get_meta_info("answer_2")["completion_tokens"]
......
......@@ -113,7 +113,7 @@ def main(args):
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
states[i] = answer
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
......@@ -134,7 +134,7 @@ def main(args):
)
states[i] = answer
tic = time.time()
tic = time.perf_counter()
loop = asyncio.get_event_loop()
batches = [
list(range(i, min(i + args.parallel, len(questions))))
......@@ -144,7 +144,7 @@ def main(args):
tasks = [get_one_answer_asyncio(k) for k in bt]
loop.run_until_complete(asyncio.gather(*tasks))
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
......
......@@ -90,7 +90,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = multi_chain_gsm8k.run_batch(
arguments,
temperature=0,
......@@ -98,7 +98,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
......
......@@ -61,7 +61,7 @@ def main(args):
def get_one_answer(i):
states[i] = multi_document_qa(generate=call_generate, **arguments[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(labels))):
get_one_answer(i)
......@@ -74,7 +74,7 @@ def main(args):
)
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(states)
......
......@@ -49,11 +49,11 @@ def main(args):
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = multi_document_qa.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print([s["answer"] for s in states])
......
......@@ -35,7 +35,7 @@ def main(args):
def get_one_answer(i):
states[i] = multi_turns(generate=call_generate, **multi_qas[i])
tic = time.time()
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(multi_qas))):
get_one_answer(i)
......@@ -50,7 +50,7 @@ def main(args):
for _ in rets:
pass
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
......@@ -27,7 +27,7 @@ def main(args):
backend = select_sglang_backend(args)
tic = time.time()
tic = time.perf_counter()
states = multi_turns.run_batch(
multi_qas,
temperature=0,
......@@ -35,7 +35,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
......
......@@ -84,7 +84,7 @@ def main(args):
backend = select_sglang_backend(args)
tic = time.time()
tic = time.perf_counter()
states = multi_turns.run_batch(
multi_qas,
temperature=0,
......@@ -92,7 +92,7 @@ def main(args):
num_threads="auto",
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
......
......@@ -146,7 +146,7 @@ def main(args):
states.append(answer)
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
if args.parallel == 1:
......@@ -173,7 +173,7 @@ def main(args):
tasks = [run_single_agent_async(arg) for arg in bt]
loop.run_until_complete(asyncio.gather(*tasks))
latency = time.time() - tic
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
......
......@@ -115,14 +115,14 @@ def main(args):
sgl.set_default_backend(backend)
states = []
tic = time.time()
tic = time.perf_counter()
states = webthink.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
......@@ -51,7 +51,7 @@ def main(args):
)
# Run requests
tic = time.time()
tic = time.perf_counter()
states = reasoning_gen.run_batch(
questions,
num_threads=args.parallel,
......@@ -60,7 +60,7 @@ def main(args):
max_new_tokens=32768,
top_p=0.95,
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Extract results and record outcomes in a list.
outcomes = []
......
......@@ -68,7 +68,7 @@ def main(args):
call_generate = partial(get_call_generate(args), temperature=0)
# Run requests
tic = time.time()
tic = time.perf_counter()
if args.backend != "lmql":
def get_one_answer(i):
......@@ -102,7 +102,7 @@ def main(args):
loop.run_until_complete(
asyncio.gather(*[get_one_answer_async(i) for i in batch])
)
latency = time.time() - tic
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment