Unverified commit 6e2da515 authored by Lifu Huang, committed by GitHub

Replace time.time() with time.perf_counter() for benchmarking. (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
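Editor's note (not part of the commit): `time.perf_counter()` is preferred over `time.time()` for measuring elapsed intervals because it is monotonic (unaffected by NTP or manual clock adjustments) and uses the highest-resolution clock available, whereas `time.time()` returns wall-clock time that can jump. A minimal sketch of the before/after pattern, with a hypothetical `slow_call()` stand-in for the benchmarked work:

```python
import time

def slow_call():
    # Hypothetical stand-in for the work being benchmarked.
    time.sleep(0.1)

# Before: wall-clock time; can be skewed if the system clock is adjusted mid-run.
tic = time.time()
slow_call()
latency_wall = time.time() - tic

# After: monotonic, high-resolution clock intended for interval measurement.
tic = time.perf_counter()
slow_call()
latency = time.perf_counter() - tic

print(f"Latency: {latency:.3f}s (wall-clock estimate: {latency_wall:.3f}s)")
```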
@@ -65,11 +65,11 @@ def main(args):
sgl.set_default_backend(select_sglang_backend(args))
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = suggest_tips.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
......
@@ -138,7 +138,7 @@ def main(args):
# Run requests
states = [None] * len(questions)
-tic = time.time()
+tic = time.perf_counter()
if args.backend != "lmql":
def get_one_answer(i):
@@ -177,7 +177,7 @@ def main(args):
tasks = [get_one_answer_async(k) for k in bt]
loop.run_until_complete(asyncio.gather(*tasks))
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
......
@@ -119,7 +119,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = tree_search.run_batch(
arguments,
temperature=0,
@@ -127,7 +127,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
answers_text.append([x for xs in s.ret_value for x in xs])
......
@@ -121,7 +121,7 @@ def main(args):
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
-tic = time.time()
+tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
)
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
......
@@ -107,7 +107,7 @@ def main(args):
backend = select_sglang_backend(args)
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = tree_search.run_batch(
arguments,
temperature=0,
@@ -115,7 +115,7 @@ def main(args):
num_threads=args.parallel,
progress_bar=True,
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
answers_text = []
for s in states:
answers_text.append([x for xs in s["answer"] for x in xs])
......
@@ -90,7 +90,7 @@ def run_eval(args):
#####################################
# Run requests
-tic = time.time()
+tic = time.perf_counter()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=args.temperature if hasattr(args, "temperature") else 0,
@@ -99,7 +99,7 @@ def run_eval(args):
return_logprob=getattr(args, "return_logprob", None),
logprob_start_len=getattr(args, "logprob_start_len", None),
)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
......
@@ -89,7 +89,7 @@ def run_eval(args):
}
# Run requests
-tic = time.time()
+tic = time.perf_counter()
loop = asyncio.get_event_loop()
@@ -98,7 +98,7 @@ def run_eval(args):
)
# End requests
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Shutdown the engine
engine.shutdown()
......
@@ -71,9 +71,9 @@ def run_eval(args):
)
# Run eval
-tic = time.time()
+tic = time.perf_counter()
result = eval_obj(sampler)
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Dump reports
metrics = result.metrics | {"score": result.score}
......
@@ -503,7 +503,7 @@ def test_hellaswag_select():
#####################################
# Run requests
-tic = time.time()
+tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
preds = []
for i, ret in enumerate(rets):
preds.append(choices[i].index(ret["answer"]))
-latency = time.time() - tic
+latency = time.perf_counter() - tic
# Compute accuracy
accuracy = np.mean(np.array(preds) == np.array(labels))
# Test generator style of run_batch
-tic = time.time()
+tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
preds_gen = []
for i, ret in enumerate(rets):
preds_gen.append(choices[i].index(ret["answer"]))
-latency_gen = time.time() - tic
+latency_gen = time.perf_counter() - tic
# Compute accuracy
accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
......
@@ -449,9 +449,9 @@ def popen_launch_server(
else:
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
-start_time = time.time()
+start_time = time.perf_counter()
with requests.Session() as session:
-while time.time() - start_time < timeout:
+while time.perf_counter() - start_time < timeout:
try:
headers = {
"Content-Type": "application/json; charset=utf-8",
@@ -584,7 +584,7 @@ class TestFile:
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-tic = time.time()
+tic = time.perf_counter()
success = True
for i, file in enumerate(files):
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
flush=True,
)
-tic = time.time()
+tic = time.perf_counter()
process = subprocess.Popen(
["python3", filename], stdout=None, stderr=None, env=os.environ
)
process.wait()
-elapsed = time.time() - tic
+elapsed = time.perf_counter() - tic
print(
f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
break
if success:
-print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
else:
-print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
return 0 if success else -1
......
@@ -92,9 +92,9 @@ def popen_launch_router(
process = subprocess.Popen(command, stdout=None, stderr=None)
-start_time = time.time()
+start_time = time.perf_counter()
with requests.Session() as session:
-while time.time() - start_time < timeout:
+while time.perf_counter() - start_time < timeout:
try:
response = session.get(f"{base_url}/health")
if response.status_code == 200:
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
return
process.terminate()
-start_time = time.time()
+start_time = time.perf_counter()
while process.poll() is None:
print(f"Terminating process {process.pid}")
-if time.time() - start_time > timeout:
+if time.perf_counter() - start_time > timeout:
raise TimeoutError(
f"Process {process.pid} failed to terminate within {timeout}s"
)
......
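Editor's note on the launch/termination loops above (not part of the commit): using a monotonic clock for a deadline loop avoids spurious early or late timeouts if the wall clock is adjusted while polling. A minimal sketch of the pattern, with a hypothetical `is_ready()` probe:

```python
import time

def wait_until_ready(is_ready, timeout: float = 300.0, poll_interval: float = 1.0) -> bool:
    """Poll is_ready() until it returns True or the deadline passes."""
    start_time = time.perf_counter()
    # perf_counter() is monotonic, so the deadline cannot shift if the wall clock jumps.
    while time.perf_counter() - start_time < timeout:
        if is_ready():
            return True
        time.sleep(poll_interval)
    return False
```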
@@ -184,9 +184,9 @@ class ExperimentRunner:
self.logger = logging.getLogger(__name__)
def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-start_time = time.time()
+start_time = time.perf_counter()
-while time.time() - start_time < timeout:
+while time.perf_counter() - start_time < timeout:
try:
response = requests.get(f"http://localhost:{port}/health")
if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
return False
def run_task(self, config: TaskConfig) -> TaskResult:
-start_time = time.time()
+start_time = time.perf_counter()
client_output = []
try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
name=config.name,
success=True,
output=formatted_output,
-runtime=time.time() - start_time,
+runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)
@@ -256,7 +256,7 @@ class ExperimentRunner:
name=config.name,
success=False,
output=str(e),
-runtime=time.time() - start_time,
+runtime=time.perf_counter() - start_time,
timestamp=datetime.now().isoformat(),
)
......
@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
hf_outputs = hf_runner.forward(truncated_prompts)
-st_start_time = time.time()
+st_start_time = time.perf_counter()
hf_outputs = hf_runner.forward(truncated_prompts)
-st_end_time = time.time()
+st_end_time = time.perf_counter()
with SRTRunner(
model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
# warm up
srt_outputs = srt_runner.forward(truncated_prompts)
-sgl_start_time = time.time()
+sgl_start_time = time.perf_counter()
srt_outputs = srt_runner.forward(truncated_prompts)
-sgl_end_time = time.time()
+sgl_end_time = time.perf_counter()
transformer_time = st_end_time - st_start_time
sgl_time = sgl_end_time - sgl_start_time
......
@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
def test_throughput(self):
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
result = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"result = `{result}`")
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
def test_throughput(self):
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
result = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"result = `{result}`")
......
@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
)
print("release_memory_occupation start")
-t = time.time()
+t = time.perf_counter()
engine.release_memory_occupation()
if _DEBUG_EXTRA:
-print("release_memory_occupation", time.time() - t)
+print("release_memory_occupation", time.perf_counter() - t)
if _DEBUG_EXTRA:
time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
time.sleep(5)
print("resume_memory_occupation start")
-t = time.time()
+t = time.perf_counter()
engine.resume_memory_occupation()
if _DEBUG_EXTRA:
-print("resume_memory_occupation", time.time() - t)
+print("resume_memory_occupation", time.perf_counter() - t)
self.assertEqual(
_try_allocate_big_tensor(),
......
@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
res = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")
......
@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
res = self.run_decode(16)
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
res = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(f"{res=}")
throughput = max_tokens / (tok - tic)
self.assertGreaterEqual(throughput, 285)
......
@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
max_tokens = 256
-tic = time.time()
+tic = time.perf_counter()
res = self.run_decode(max_tokens)
-tok = time.time()
+tok = time.perf_counter()
print(res["text"])
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")
......
@@ -164,7 +164,7 @@ def init_process_hf(
)
dist.barrier(group=group, device_ids=[rank])
torch.cuda.synchronize()
-time_begin_broadcast = time.time()
+time_begin_broadcast = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
group=group,
)
torch.cuda.synchronize()
-time_end_broadcast = time.time()
+time_end_broadcast = time.perf_counter()
# Measure the latency of broadcasting/weights update.
broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
)
torch.cuda.synchronize()
-time_begin_update = time.time()
+time_begin_update = time.perf_counter()
# The last parameter is lm_head.weight, which is tied
# with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
},
)
torch.cuda.synchronize()
-time_end_update = time.time()
+time_end_update = time.perf_counter()
# Measure the latency of broadcast/weights update.
update_time = time_end_update - time_begin_update
......
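Editor's note on the broadcast/update hunks above (not part of the commit): the timed region is bracketed by torch.cuda.synchronize() because CUDA kernels launch asynchronously, so the host-side clock only measures real GPU latency after synchronization. A minimal sketch of that pattern, with a hypothetical do_gpu_work() callable:

```python
import time
import torch

def time_gpu_section(do_gpu_work) -> float:
    """Time a GPU operation with perf_counter, synchronizing around the timed region."""
    torch.cuda.synchronize()      # ensure previously queued kernels are finished
    begin = time.perf_counter()
    do_gpu_work()                 # e.g. a broadcast or a weight update
    torch.cuda.synchronize()      # wait for the timed kernels to complete
    return time.perf_counter() - begin
```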
@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
memory_before = torch.cuda.memory_allocated()
new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
-time_start = time.time()
+time_start = time.perf_counter()
engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-print(f"Time delta: {time.time() - time_start:.03f}")
+print(f"Time delta: {time.perf_counter() - time_start:.03f}")
for param_name in param_names[:3]:
_check_param(engine, param_name, [1.5] * 5)
......