"src/vscode:/vscode.git/clone" did not exist on "2ff3006c35b3f637dc1fc63774a997e7172999f3"
Unverified commit 6e2da515, authored by Lifu Huang and committed by GitHub

Replace time.time() with time.perf_counter() for benchmarking (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
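
The change is mechanical but meaningful: every benchmark in this diff takes a start timestamp, runs the workload, and subtracts. time.perf_counter() is the right clock for that pattern because it is monotonic and has the highest available resolution, so the difference between two readings is always a valid duration; time.time() reads the system wall clock, which the OS may step forwards or backwards (e.g. via NTP), corrupting short measurements. Note that the absolute value of perf_counter() is undefined, so only differences are meaningful, which is all these benchmarks use. A minimal sketch of the pattern (the timed helper below is illustrative, not part of this commit):

import time

def timed(fn, *args, **kwargs):
    # Monotonic, high-resolution start timestamp; only the difference
    # between two perf_counter() readings is meaningful.
    tic = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed_ms = (time.perf_counter() - tic) * 1000  # convert to ms
    return result, elapsed_ms

_, ms = timed(sum, range(1_000_000))
print(f"Latency: {ms:.3f} ms")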
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         text_qa.run_batch(
             list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
         )
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic

     return tot_time
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         # Send a hint to cache the prefix
         text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
         # Send the batch
         text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic

     return tot_time
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
     all_prompts = [x for prompt_list in all_prompts for x in prompt_list]

-    tic = time.time()
+    tic = time.perf_counter()
     text_qa.run_batch(
         list(zip(all_prompts, [gen_len] * len(all_prompts))),
     )
-    tot_time = time.time() - tic
+    tot_time = time.perf_counter() - tic

     return tot_time
......
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
     }
     data = {"text": prompts, "sampling_params": sampling_params}

-    start_time = time.time()
+    start_time = time.perf_counter()
     try:
         response = requests.post(
             endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
             error = response.json()
             raise RuntimeError(f"Request {request_id} failed: {error}")
         result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
         avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
         return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
     except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
     num_requests = len(batched_prompts)

     # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()

     for i, batch_prompts in enumerate(batched_prompts):
         request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
             results.append(result)

     # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms

     return results, total_latency
......
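
Measuring end-to-end request latency this way works because perf_counter, unlike time.process_time(), keeps advancing while the process sleeps or blocks on I/O, so the reading covers network transit and server time as well. A small sketch of that property (the URL and payload below are placeholders, not the benchmark's real endpoint):

import time
import requests

tic = time.perf_counter()
# perf_counter advances during blocking I/O, so this captures the
# full round-trip latency of the request.
resp = requests.post("http://localhost:30000/generate", json={"text": "hi"}, timeout=3600)
elapsed_ms = (time.perf_counter() - tic) * 1000
print(f"request took {elapsed_ms:.1f} ms, status {resp.status_code}")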
@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         for prompt in batch_prompts:
             tokens = tokenizer.encode(prompt)
-        sequential_time = (time.time() - start_time) * 1000
+        sequential_time = (time.perf_counter() - start_time) * 1000
         sequential_times.append(sequential_time)

     # Batch tokenization using tokenizer()
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         tokens = tokenizer(batch_prompts)
-        batch_time = (time.time() - start_time) * 1000
+        batch_time = (time.perf_counter() - start_time) * 1000
         batch_times.append(batch_time)

     return {
......
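
The two hunks above time the same prompts two ways: one tokenizer.encode() call per prompt versus a single tokenizer(batch) call, which amortizes per-call overhead across the batch. A standalone sketch of that comparison, assuming a Hugging Face-style tokenizer (the function and names here are illustrative):

import time

def compare_tokenization(tokenizer, prompts, num_runs=5):
    for _ in range(num_runs):
        # Sequential: one encode call per prompt.
        tic = time.perf_counter()
        for p in prompts:
            tokenizer.encode(p)
        sequential_ms = (time.perf_counter() - tic) * 1000

        # Batched: the whole list in one call.
        tic = time.perf_counter()
        tokenizer(prompts)
        batch_ms = (time.perf_counter() - tic) * 1000

        print(f"sequential: {sequential_ms:.2f} ms, batch: {batch_ms:.2f} ms")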
@@ -39,7 +39,7 @@ def main(args):
             answer = await call_generate(**arg, temperature=0)
             states.append(answer)

-    tic = time.time()
+    tic = time.perf_counter()
     # we always sequentially execute agent calls to maintain its dependency
     if args.backend != "lmql":
         for arg in tqdm(arguments):
@@ -50,7 +50,7 @@ def main(args):
         loop = asyncio.get_event_loop()
         for arg in tqdm(arguments):
             loop.run_until_complete(get_one_answer_async(arg))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     print(f"Latency: {latency:.3f}")
......
@@ -35,14 +35,14 @@ def main(args):
     states = []

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     for a in arguments:
         # only a single key in the dict
         for func, arg in a.items():
             result = func.run(**arg)
             result.sync()
         states.append(result)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......
@@ -75,7 +75,7 @@ def main(args):
         )
         states[i] = answer

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)
@@ -106,9 +106,9 @@ def main(args):
             for j in range(len(rets)):
                 states[i + j] = rets[j]

-    tic = time.time()
+    tic = time.perf_counter()
     asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
......
@@ -84,14 +84,14 @@ def main(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
......
@@ -57,7 +57,7 @@ def main(args):
             context=few_shot_examples + questions[i], choices=choices[i]
         )

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)
@@ -82,10 +82,10 @@ def main(args):
             for j in range(len(rets)):
                 preds[i + j] = rets[j]

-    tic = time.time()
+    tic = time.perf_counter()
     asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
......
@@ -68,7 +68,7 @@ def main(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -76,7 +76,7 @@ def main(args):
         progress_bar=True,
     )
     preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
......
@@ -261,7 +261,7 @@ class WorkloadGenerator:
                 client_id, payload = item
                 response = await async_request_sglang_generate(payload, self.url, self.pbar)
                 if self.pbar.n == self.pbar.total:
-                    self.finished_time = time.time()
+                    self.finished_time = time.perf_counter()
                 self.response_queue.put((client_id, response))
             except Exception as e:
                 print(f"Request failed: {e}")
@@ -334,7 +334,7 @@ class WorkloadGenerator:
         request_thread = threading.Thread(target=self.request_sender, daemon=True)
         response_thread = threading.Thread(target=self.response_handler, daemon=True)

-        self.start_time = time.time()
+        self.start_time = time.perf_counter()
         request_thread.start()
         response_thread.start()
......
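
One subtlety in WorkloadGenerator: start_time and finished_time are read on different threads. That is still sound, because time.perf_counter() is documented as system-wide, so readings taken on different threads of the same process are mutually comparable. A toy demonstration (the worker function is hypothetical, not from this repo):

import threading
import time

start = time.perf_counter()  # read on the main thread

def worker():
    time.sleep(0.1)
    # perf_counter is system-wide, so this reading is comparable
    # with the one taken on the main thread.
    elapsed_ms = (time.perf_counter() - start) * 1000
    print(f"worker observed {elapsed_ms:.1f} ms")

t = threading.Thread(target=worker)
t.start()
t.join()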
@@ -53,7 +53,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = json_decode(generate=call_generate, **arguments[i])

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -68,7 +68,7 @@ def main(args):
         for _ in rets:
             pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......
@@ -63,11 +63,11 @@ def main(args):
     json_warm_up.run().sync()

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = json_decode.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......
@@ -175,7 +175,7 @@ def bench_character(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
         if args.parallel == 1:
@@ -202,7 +202,7 @@ def bench_character(args):
                 asyncio.gather(*[get_one_answer_async(i) for i in bt])
             )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
@@ -236,7 +236,7 @@ def bench_city_doc(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -246,7 +246,7 @@ def bench_city_doc(args):
         for _ in rets:
             pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
......
@@ -67,14 +67,14 @@ def bench_city_doc(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = city_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
@@ -91,14 +91,14 @@ def bench_character(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = character_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
......
@@ -85,14 +85,14 @@ def bench_schema(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = schema_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Check if the outputs are valid
     indexes = []
......
@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
         ]

         print(f"Start tuning over {len(search_space)} configurations...")

-        start = time.time()
+        start = time.perf_counter()
         configs = _distribute(
             "tune",
             [
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
                     use_int8_w8a16,
                     block_shape,
                 )
-        end = time.time()
+        end = time.perf_counter()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
         outputs = _distribute(
......
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
         config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
     ]

-    start = time.time()
+    start = time.perf_counter()
     results = {}

     for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
         N, K = shape[0], shape[1]
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
         best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
         save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)

-    end = time.time()
+    end = time.perf_counter()
     print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
......
@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
     # Select backend
     backend = select_sglang_backend(args)

-    tic = time.time()
+    tic = time.perf_counter()
     states = line_retrieval.run_batch(
         arguments,
         temperature=0,
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     corrects = []
     for i in range(len(arguments)):
......
@@ -41,7 +41,7 @@ def main(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm.tqdm(range(len(lines))):
             image_file = arguments[i]["image_file"]
@@ -52,7 +52,7 @@ def main(args):
         states = image_qa.run_batch(
             arguments, temperature=0, num_threads=args.parallel, progress_bar=True
         )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     print(f"Latency: {latency:.3f}")
......
@@ -85,7 +85,7 @@ def main(args):
     call_generate = partial(get_call_generate(args), temperature=0)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
@@ -120,7 +120,7 @@ def main(args):
                 asyncio.gather(*[get_one_answer_async(i) for i in bt])
             )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
......