"src/vscode:/vscode.git/clone" did not exist on "2ff3006c35b3f637dc1fc63774a997e7172999f3"
Unverified commit 6e2da515, authored by Lifu Huang and committed by GitHub

Replace time.time() with time.perf_counter() for benchmarking (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
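
The change is mechanical but meaningful: every benchmark in this diff takes a start timestamp, runs the workload, and subtracts. time.perf_counter() is the right clock for that pattern because it is monotonic and has the highest available resolution, so the difference between two readings is always a valid duration; time.time() reads the system wall clock, which the OS may step forwards or backwards (e.g. via NTP), corrupting short measurements. Note that the absolute value of perf_counter() is undefined, so only differences are meaningful, which is all these benchmarks use. A minimal sketch of the pattern (the timed helper below is illustrative, not part of this commit):

import time

def timed(fn, *args, **kwargs):
    # Monotonic, high-resolution start timestamp; only the difference
    # between two perf_counter() readings is meaningful.
    tic = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed_ms = (time.perf_counter() - tic) * 1000  # convert to ms
    return result, elapsed_ms

_, ms = timed(sum, range(1_000_000))
print(f"Latency: {ms:.3f} ms")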
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         text_qa.run_batch(
             list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
         )
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic

     return tot_time
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         # Send a hint to cache the prefix
         text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
         # Send the batch
         text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic

     return tot_time
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
     all_prompts = [x for prompt_list in all_prompts for x in prompt_list]

-    tic = time.time()
+    tic = time.perf_counter()
     text_qa.run_batch(
         list(zip(all_prompts, [gen_len] * len(all_prompts))),
     )
-    tot_time = time.time() - tic
+    tot_time = time.perf_counter() - tic

     return tot_time
......
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
     }
     data = {"text": prompts, "sampling_params": sampling_params}

-    start_time = time.time()
+    start_time = time.perf_counter()
     try:
         response = requests.post(
             endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
             error = response.json()
             raise RuntimeError(f"Request {request_id} failed: {error}")
         result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
         avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
         return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
     except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
     num_requests = len(batched_prompts)

     # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()

     for i, batch_prompts in enumerate(batched_prompts):
         request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
             results.append(result)

     # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms

     return results, total_latency
......
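
Measuring end-to-end request latency this way works because perf_counter, unlike time.process_time(), keeps advancing while the process sleeps or blocks on I/O, so the reading covers network transit and server time as well. A small sketch of that property (the URL and payload below are placeholders, not the benchmark's real endpoint):

import time
import requests

tic = time.perf_counter()
# perf_counter advances during blocking I/O, so this captures the
# full round-trip latency of the request.
resp = requests.post("http://localhost:30000/generate", json={"text": "hi"}, timeout=3600)
elapsed_ms = (time.perf_counter() - tic) * 1000
print(f"request took {elapsed_ms:.1f} ms, status {resp.status_code}")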
@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         for prompt in batch_prompts:
             tokens = tokenizer.encode(prompt)
-        sequential_time = (time.time() - start_time) * 1000
+        sequential_time = (time.perf_counter() - start_time) * 1000
         sequential_times.append(sequential_time)

     # Batch tokenization using tokenizer()
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         tokens = tokenizer(batch_prompts)
-        batch_time = (time.time() - start_time) * 1000
+        batch_time = (time.perf_counter() - start_time) * 1000
         batch_times.append(batch_time)

     return {
......
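
The two hunks above time the same prompts two ways: one tokenizer.encode() call per prompt versus a single tokenizer(batch) call, which amortizes per-call overhead across the batch. A standalone sketch of that comparison, assuming a Hugging Face-style tokenizer (the function and names here are illustrative):

import time

def compare_tokenization(tokenizer, prompts, num_runs=5):
    for _ in range(num_runs):
        # Sequential: one encode call per prompt.
        tic = time.perf_counter()
        for p in prompts:
            tokenizer.encode(p)
        sequential_ms = (time.perf_counter() - tic) * 1000

        # Batched: the whole list in one call.
        tic = time.perf_counter()
        tokenizer(prompts)
        batch_ms = (time.perf_counter() - tic) * 1000

        print(f"sequential: {sequential_ms:.2f} ms, batch: {batch_ms:.2f} ms")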
@@ -39,7 +39,7 @@ def main(args):
             answer = await call_generate(**arg, temperature=0)
             states.append(answer)

-    tic = time.time()
+    tic = time.perf_counter()
     # we always sequentially execute agent calls to maintain its dependency
     if args.backend != "lmql":
         for arg in tqdm(arguments):
@@ -50,7 +50,7 @@ def main(args):
         loop = asyncio.get_event_loop()
         for arg in tqdm(arguments):
             loop.run_until_complete(get_one_answer_async(arg))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     print(f"Latency: {latency:.3f}")
......
@@ -35,14 +35,14 @@ def main(args):
     states = []

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     for a in arguments:
         # only a single key in the dict
         for func, arg in a.items():
             result = func.run(**arg)
             result.sync()
         states.append(result)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......
@@ -75,7 +75,7 @@ def main(args):
         )
         states[i] = answer

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)
@@ -106,9 +106,9 @@ def main(args):
             for j in range(len(rets)):
                 states[i + j] = rets[j]

-    tic = time.time()
+    tic = time.perf_counter()
     asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
......
@@ -84,14 +84,14 @@ def main(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
......
@@ -57,7 +57,7 @@ def main(args):
             context=few_shot_examples + questions[i], choices=choices[i]
         )

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)
@@ -82,10 +82,10 @@ def main(args):
             for j in range(len(rets)):
                 preds[i + j] = rets[j]

-    tic = time.time()
+    tic = time.perf_counter()
     asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
......
@@ -68,7 +68,7 @@ def main(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -76,7 +76,7 @@ def main(args):
         progress_bar=True,
     )
     preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
......
@@ -261,7 +261,7 @@ class WorkloadGenerator:
                 client_id, payload = item
                 response = await async_request_sglang_generate(payload, self.url, self.pbar)
                 if self.pbar.n == self.pbar.total:
-                    self.finished_time = time.time()
+                    self.finished_time = time.perf_counter()
                 self.response_queue.put((client_id, response))
             except Exception as e:
                 print(f"Request failed: {e}")
@@ -334,7 +334,7 @@ class WorkloadGenerator:
         request_thread = threading.Thread(target=self.request_sender, daemon=True)
         response_thread = threading.Thread(target=self.response_handler, daemon=True)

-        self.start_time = time.time()
+        self.start_time = time.perf_counter()
         request_thread.start()
         response_thread.start()
......
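
One subtlety in WorkloadGenerator: start_time and finished_time are read on different threads. That is still sound, because time.perf_counter() is documented as system-wide, so readings taken on different threads of the same process are mutually comparable. A toy demonstration (the worker function is hypothetical, not from this repo):

import threading
import time

start = time.perf_counter()  # read on the main thread

def worker():
    time.sleep(0.1)
    # perf_counter is system-wide, so this reading is comparable
    # with the one taken on the main thread.
    elapsed_ms = (time.perf_counter() - start) * 1000
    print(f"worker observed {elapsed_ms:.1f} ms")

t = threading.Thread(target=worker)
t.start()
t.join()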
@@ -53,7 +53,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = json_decode(generate=call_generate, **arguments[i])

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -68,7 +68,7 @@ def main(args):
         for _ in rets:
             pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......
@@ -63,11 +63,11 @@ def main(args):
     json_warm_up.run().sync()

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = json_decode.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......
@@ -175,7 +175,7 @@ def bench_character(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
         if args.parallel == 1:
@@ -202,7 +202,7 @@ def bench_character(args):
                 asyncio.gather(*[get_one_answer_async(i) for i in bt])
             )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
@@ -236,7 +236,7 @@ def bench_city_doc(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -246,7 +246,7 @@ def bench_city_doc(args):
         for _ in rets:
             pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
......
@@ -67,14 +67,14 @@ def bench_city_doc(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = city_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
@@ -91,14 +91,14 @@ def bench_character(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = character_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
......
@@ -85,14 +85,14 @@ def bench_schema(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = schema_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Check if the outputs are valid
     indexes = []
......
@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
         ]

         print(f"Start tuning over {len(search_space)} configurations...")

-        start = time.time()
+        start = time.perf_counter()
         configs = _distribute(
             "tune",
             [
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
                     use_int8_w8a16,
                     block_shape,
                 )
-        end = time.time()
+        end = time.perf_counter()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
         outputs = _distribute(
......
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
         config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
     ]

-    start = time.time()
+    start = time.perf_counter()
     results = {}

     for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
         N, K = shape[0], shape[1]
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
         best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
         save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)

-    end = time.time()
+    end = time.perf_counter()
     print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
......
@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
     # Select backend
     backend = select_sglang_backend(args)

-    tic = time.time()
+    tic = time.perf_counter()
     states = line_retrieval.run_batch(
         arguments,
         temperature=0,
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     corrects = []
     for i in range(len(arguments)):
......
@@ -41,7 +41,7 @@ def main(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm.tqdm(range(len(lines))):
             image_file = arguments[i]["image_file"]
@@ -52,7 +52,7 @@ def main(args):
         states = image_qa.run_batch(
             arguments, temperature=0, num_threads=args.parallel, progress_bar=True
         )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     print(f"Latency: {latency:.3f}")
......
@@ -85,7 +85,7 @@ def main(args):
     call_generate = partial(get_call_generate(args), temperature=0)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
@@ -120,7 +120,7 @@ def main(args):
                 asyncio.gather(*[get_one_answer_async(i) for i in bt])
             )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......