Unverified commit 6e2da515 authored by Lifu Huang, committed by GitHub

Replace time.time() with time.perf_counter() for benchmarking. (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
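Editor's note (not part of the commit): `time.perf_counter()` is preferred over `time.time()` for measuring elapsed intervals because it is monotonic (unaffected by NTP or manual clock adjustments) and uses the highest-resolution clock available, whereas `time.time()` returns wall-clock time that can jump. A minimal sketch of the before/after pattern, with a hypothetical `slow_call()` stand-in for the benchmarked work:

```python
import time

def slow_call():
    # Hypothetical stand-in for the work being benchmarked.
    time.sleep(0.1)

# Before: wall-clock time; can be skewed if the system clock is adjusted mid-run.
tic = time.time()
slow_call()
latency_wall = time.time() - tic

# After: monotonic, high-resolution clock intended for interval measurement.
tic = time.perf_counter()
slow_call()
latency = time.perf_counter() - tic

print(f"Latency: {latency:.3f}s (wall-clock estimate: {latency_wall:.3f}s)")
```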
@@ -65,11 +65,11 @@ def main(args):
sgl.set_default_backend(select_sglang_backend(args))
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = suggest_tips.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
@@ -138,7 +138,7 @@ def main(args):
# Run requests
states = [None] * len(questions)
-tic = time.time()
+tic = time.perf_counter()
if args.backend != "lmql":
def get_one_answer(i):
@@ -177,7 +177,7 @@ def main(args):
tasks = [get_one_answer_async(k) for k in bt]
loop.run_until_complete(asyncio.gather(*tasks))
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
......
@@ -119,7 +119,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = tree_search.run_batch(
arguments,
temperature=0,
@@ -127,7 +127,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
answers_text.append([x for xs in s.ret_value for x in xs])
......
@@ -121,7 +121,7 @@ def main(args):
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
-tic = time.time()
+tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
)
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
......
@@ -107,7 +107,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = tree_search.run_batch(
arguments,
temperature=0,
@@ -115,7 +115,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
answers_text.append([x for xs in s["answer"] for x in xs])
......
@@ -90,7 +90,7 @@ def run_eval(args):
#####################################
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=args.temperature if hasattr(args, "temperature") else 0,
@@ -99,7 +99,7 @@ def run_eval(args):
return_logprob=getattr(args, "return_logprob", None),
logprob_start_len=getattr(args, "logprob_start_len", None),
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
......
@@ -89,7 +89,7 @@ def run_eval(args):
}
# Run requests
-tic = time.time()
+tic = time.perf_counter()
loop = asyncio.get_event_loop()
@@ -98,7 +98,7 @@ def run_eval(args):
)
# End requests
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Shutdown the engine
engine.shutdown()
......
@@ -71,9 +71,9 @@ def run_eval(args):
)
# Run eval
-tic = time.time()
+tic = time.perf_counter()
result = eval_obj(sampler)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Dump reports
metrics = result.metrics | {"score": result.score}
......
@@ -503,7 +503,7 @@ def test_hellaswag_select():
#####################################
# Run requests
-tic = time.time()
+tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
preds = []
for i, ret in enumerate(rets):
preds.append(choices[i].index(ret["answer"]))
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Compute accuracy
accuracy = np.mean(np.array(preds) == np.array(labels))
# Test generator style of run_batch
-tic = time.time()
+tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
preds_gen = []
for i, ret in enumerate(rets):
preds_gen.append(choices[i].index(ret["answer"]))
-latency_gen = time.time() - tic
+latency_gen = time.perf_counter() - tic
# Compute accuracy
accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
......
@@ -449,9 +449,9 @@ def popen_launch_server(
else:
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
-start_time = time.time()
+start_time = time.perf_counter()
with requests.Session() as session:
-while time.time() - start_time < timeout:
+while time.perf_counter() - start_time < timeout:
try:
headers = {
"Content-Type": "application/json; charset=utf-8",
@@ -584,7 +584,7 @@ class TestFile:
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-tic = time.time()
+tic = time.perf_counter()
success = True
for i, file in enumerate(files):
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
flush=True,
)
-tic = time.time()
+tic = time.perf_counter()
process = subprocess.Popen(
["python3", filename], stdout=None, stderr=None, env=os.environ
)
process.wait()
-elapsed = time.time() - tic
+elapsed = time.perf_counter() - tic
print(
f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
break
if success:
-print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
else:
-print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
return 0 if success else -1
......
@@ -92,9 +92,9 @@ def popen_launch_router(
process = subprocess.Popen(command, stdout=None, stderr=None)
-start_time = time.time()
+start_time = time.perf_counter()
with requests.Session() as session:
-while time.time() - start_time < timeout:
+while time.perf_counter() - start_time < timeout:
try:
response = session.get(f"{base_url}/health")
if response.status_code == 200:
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
return
process.terminate()
-start_time = time.time()
+start_time = time.perf_counter()
while process.poll() is None:
print(f"Terminating process {process.pid}")
-if time.time() - start_time > timeout:
+if time.perf_counter() - start_time > timeout:
raise TimeoutError(
f"Process {process.pid} failed to terminate within {timeout}s"
)
......
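Editor's note on the launch/termination loops above (not part of the commit): using a monotonic clock for a deadline loop avoids spurious early or late timeouts if the wall clock is adjusted while polling. A minimal sketch of the pattern, with a hypothetical `is_ready()` probe:

```python
import time

def wait_until_ready(is_ready, timeout: float = 300.0, poll_interval: float = 1.0) -> bool:
    """Poll is_ready() until it returns True or the deadline passes."""
    start_time = time.perf_counter()
    # perf_counter() is monotonic, so the deadline cannot shift if the wall clock jumps.
    while time.perf_counter() - start_time < timeout:
        if is_ready():
            return True
        time.sleep(poll_interval)
    return False
```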
@@ -184,9 +184,9 @@ class ExperimentRunner:
self.logger = logging.getLogger(__name__)
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-start_time = time.time()
+start_time = time.perf_counter()
-while time.time() - start_time < timeout:
+while time.perf_counter() - start_time < timeout:
try:
response = requests.get(f"http://localhost:{port}/health")
if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
return False
def run_task(self, config: TaskConfig) -> TaskResult:
-start_time = time.time()
+start_time = time.perf_counter()
client_output = []
try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
name=config.name,
success=True,
output=formatted_output,
-runtime=time.time() - start_time,
+runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)
@@ -256,7 +256,7 @@ class ExperimentRunner:
name=config.name,
success=False,
output=str(e),
-runtime=time.time() - start_time,
+runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)
......
@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
hf_outputs = hf_runner.forward(truncated_prompts)
-st_start_time = time.time()
+st_start_time = time.perf_counter()
hf_outputs = hf_runner.forward(truncated_prompts)
-st_end_time = time.time()
+st_end_time = time.perf_counter()
with SRTRunner(
model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
srt_outputs = srt_runner.forward(truncated_prompts)
-sgl_start_time = time.time()
+sgl_start_time = time.perf_counter()
srt_outputs = srt_runner.forward(truncated_prompts)
-sgl_end_time = time.time()
+sgl_end_time = time.perf_counter()
transformer_time = st_end_time - st_start_time
sgl_time = sgl_end_time - sgl_start_time
......
@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
def test_throughput(self):
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
result = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"result = `{result}`")
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
def test_throughput(self):
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
result = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"result = `{result}`")
......
@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
)
print("release_memory_occupation start")
-t = time.time()
+t = time.perf_counter()
engine.release_memory_occupation()
if _DEBUG_EXTRA:
-print("release_memory_occupation", time.time() - t)
+print("release_memory_occupation", time.perf_counter() - t)
if _DEBUG_EXTRA:
time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
time.sleep(5)
print("resume_memory_occupation start")
-t = time.time()
+t = time.perf_counter()
engine.resume_memory_occupation()
if _DEBUG_EXTRA:
-print("resume_memory_occupation", time.time() - t)
+print("resume_memory_occupation", time.perf_counter() - t)
self.assertEqual(
_try_allocate_big_tensor(),
......
@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
res = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")
......
@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
res = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
self.assertGreaterEqual(throughput, 285)
......
@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
res = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(res["text"])
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")
......
@@ -164,7 +164,7 @@ def init_process_hf(
)
dist.barrier(group=group, device_ids=[rank])
torch.cuda.synchronize()
-time_begin_broadcast = time.time()
+time_begin_broadcast = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
group=group,
)
torch.cuda.synchronize()
-time_end_broadcast = time.time()
+time_end_broadcast = time.perf_counter()
# Measure the latency of broadcasting/weights update.
broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
)
torch.cuda.synchronize()
-time_begin_update = time.time()
+time_begin_update = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
},
)
torch.cuda.synchronize()
-time_end_update = time.time()
+time_end_update = time.perf_counter()
# Measure the latency of broadcast/weights update.
update_time = time_end_update - time_begin_update
......
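Editor's note on the broadcast/update hunks above (not part of the commit): the timed region is bracketed by torch.cuda.synchronize() because CUDA kernels launch asynchronously, so the host-side clock only measures real GPU latency after synchronization. A minimal sketch of that pattern, with a hypothetical do_gpu_work() callable:

```python
import time
import torch

def time_gpu_section(do_gpu_work) -> float:
    """Time a GPU operation with perf_counter, synchronizing around the timed region."""
    torch.cuda.synchronize()      # ensure previously queued kernels are finished
    begin = time.perf_counter()
    do_gpu_work()                 # e.g. a broadcast or a weight update
    torch.cuda.synchronize()      # wait for the timed kernels to complete
    return time.perf_counter() - begin
```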
@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
memory_before = torch.cuda.memory_allocated()
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
-time_start = time.time()
+time_start = time.perf_counter()
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-print(f"Time delta: {time.time() - time_start:.03f}")
+print(f"Time delta: {time.perf_counter() - time_start:.03f}")
for param_name in param_names[:3]:
_check_param(engine, param_name, [1.5] * 5)
......