Unverified Commit 6e2da515 authored by Lifu Huang, committed by GitHub

Replace time.time() with time.perf_counter() for benchmarking. (#6178)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent e9a47f4c
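For context on the change itself: time.perf_counter() is a monotonic, high-resolution clock whose differences reliably measure elapsed time, while time.time() reports wall-clock time and can jump forwards or backwards when the system clock is adjusted (e.g. by NTP), corrupting latency measurements; on Windows, time.time() has also historically ticked at roughly 15 ms granularity. A minimal sketch of the pattern this commit standardizes on (the workload line is a stand-in, not code from this repository):

import time

# perf_counter() is monotonic and high-resolution; only differences
# between two readings are meaningful, which is exactly how the
# benchmarks below use it.
tic = time.perf_counter()
total = sum(i * i for i in range(1_000_000))  # stand-in workload
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}s")

Both clocks return seconds as a float, so the substitution is drop-in wherever only the difference between two readings is used, as in every hunk below.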
@@ -65,11 +65,11 @@ def main(args):
     sgl.set_default_backend(select_sglang_backend(args))

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = suggest_tips.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
...
@@ -138,7 +138,7 @@ def main(args):
     # Run requests
     states = [None] * len(questions)
-    tic = time.time()
+    tic = time.perf_counter()

     if args.backend != "lmql":

         def get_one_answer(i):
@@ -177,7 +177,7 @@ def main(args):
             tasks = [get_one_answer_async(k) for k in bt]
             loop.run_until_complete(asyncio.gather(*tasks))

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     answers_text = []
     for s in states:
...
@@ -119,7 +119,7 @@ def main(args):
     backend = select_sglang_backend(args)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = tree_search.run_batch(
         arguments,
         temperature=0,
@@ -127,7 +127,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
         answers_text.append([x for xs in s.ret_value for x in xs])
...
@@ -121,7 +121,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = tree_search(**arguments[i], call_generate=call_generate)

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)
@@ -134,7 +134,7 @@ def main(args):
             )
         )

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     answers_text = []
     for s in states:
...
@@ -107,7 +107,7 @@ def main(args):
     backend = select_sglang_backend(args)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = tree_search.run_batch(
         arguments,
         temperature=0,
@@ -115,7 +115,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
         answers_text.append([x for xs in s["answer"] for x in xs])
...
@@ -90,7 +90,7 @@ def run_eval(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=args.temperature if hasattr(args, "temperature") else 0,
@@ -99,7 +99,7 @@ def run_eval(args):
         return_logprob=getattr(args, "return_logprob", None),
         logprob_start_len=getattr(args, "logprob_start_len", None),
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
...
@@ -89,7 +89,7 @@ def run_eval(args):
     }

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()

     loop = asyncio.get_event_loop()
@@ -98,7 +98,7 @@ def run_eval(args):
     )

     # End requests
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Shutdown the engine
     engine.shutdown()
...
@@ -71,9 +71,9 @@ def run_eval(args):
     )

     # Run eval
-    tic = time.time()
+    tic = time.perf_counter()
     result = eval_obj(sampler)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Dump reports
     metrics = result.metrics | {"score": result.score}
...
@@ -503,7 +503,7 @@ def test_hellaswag_select():
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -514,13 +514,13 @@ def test_hellaswag_select():
     preds = []
     for i, ret in enumerate(rets):
         preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))

     # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -531,7 +531,7 @@ def test_hellaswag_select():
     preds_gen = []
     for i, ret in enumerate(rets):
         preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic

     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
...
@@ -449,9 +449,9 @@ def popen_launch_server(
     else:
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",
@@ -584,7 +584,7 @@ class TestFile:

 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
     success = True

     for i, file in enumerate(files):
@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
            flush=True,
        )
-        tic = time.time()
+        tic = time.perf_counter()

        process = subprocess.Popen(
            ["python3", filename], stdout=None, stderr=None, env=os.environ
        )
        process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic

        print(
            f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)

     return 0 if success else -1
...
@@ -92,9 +92,9 @@ def popen_launch_router(
     process = subprocess.Popen(command, stdout=None, stderr=None)

-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 response = session.get(f"{base_url}/health")
                 if response.status_code == 200:
@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
         return

     process.terminate()
-    start_time = time.time()
+    start_time = time.perf_counter()

     while process.poll() is None:
         print(f"Terminating process {process.pid}")
-        if time.time() - start_time > timeout:
+        if time.perf_counter() - start_time > timeout:
             raise TimeoutError(
                 f"Process {process.pid} failed to terminate within {timeout}s"
             )
...
@@ -184,9 +184,9 @@ class ExperimentRunner:
         self.logger = logging.getLogger(__name__)

     def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-        start_time = time.time()
-        while time.time() - start_time < timeout:
+        start_time = time.perf_counter()
+        while time.perf_counter() - start_time < timeout:
             try:
                 response = requests.get(f"http://localhost:{port}/health")
                 if response.status_code == 200:
@@ -197,7 +197,7 @@ class ExperimentRunner:
         return False

     def run_task(self, config: TaskConfig) -> TaskResult:
-        start_time = time.time()
+        start_time = time.perf_counter()
         client_output = []

         try:
@@ -247,7 +247,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=True,
                 output=formatted_output,
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )
@@ -256,7 +256,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=False,
                 output=str(e),
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )
...
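The server wait loops in the hunks above all follow the same deadline pattern; a self-contained sketch of the idea, assuming the requests package (wait_for_health, the url parameter, and the poll interval are illustrative, not names from this codebase):

import time

import requests


def wait_for_health(url: str, timeout: float = 300.0) -> bool:
    # A monotonic deadline: an NTP step during server startup can
    # neither extend the wait forever nor cut it short, unlike a
    # time.time()-based deadline.
    start = time.perf_counter()
    with requests.Session() as session:
        while time.perf_counter() - start < timeout:
            try:
                if session.get(url, timeout=5).status_code == 200:
                    return True
            except requests.RequestException:
                pass  # server not up yet; keep polling
            time.sleep(1)
    return False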
@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             hf_outputs = hf_runner.forward(truncated_prompts)

-            st_start_time = time.time()
+            st_start_time = time.perf_counter()
             hf_outputs = hf_runner.forward(truncated_prompts)
-            st_end_time = time.time()
+            st_end_time = time.perf_counter()

         with SRTRunner(
             model_path,
@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             srt_outputs = srt_runner.forward(truncated_prompts)

-            sgl_start_time = time.time()
+            sgl_start_time = time.perf_counter()
             srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_end_time = time.time()
+            sgl_end_time = time.perf_counter()

         transformer_time = st_end_time - st_start_time
         sgl_time = sgl_end_time - sgl_start_time
...
@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")
@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")
...
@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
         )

         print("release_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.release_memory_occupation()
         if _DEBUG_EXTRA:
-            print("release_memory_occupation", time.time() - t)
+            print("release_memory_occupation", time.perf_counter() - t)

         if _DEBUG_EXTRA:
             time.sleep(5)
@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
             time.sleep(5)

         print("resume_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.resume_memory_occupation()
         if _DEBUG_EXTRA:
-            print("resume_memory_occupation", time.time() - t)
+            print("resume_memory_occupation", time.perf_counter() - t)

         self.assertEqual(
             _try_allocate_big_tensor(),
...
@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
...
@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         self.assertGreaterEqual(throughput, 285)
...
@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
...
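The three throughput tests above share one measurement pattern; a minimal runnable sketch (run_decode here is a hypothetical stand-in for the tests' real decode call):

import time


def run_decode(max_tokens: int) -> str:
    # Hypothetical stand-in for the real decode call in the tests above.
    return "x" * max_tokens


max_tokens = 256
tic = time.perf_counter()
run_decode(max_tokens)
tok = time.perf_counter()
# Tokens per second; only meaningful if the call generated exactly
# max_tokens tokens, as the throughput tests above assume.
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput:.1f} tokens/s")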
@@ -164,7 +164,7 @@ def init_process_hf(
     )
     dist.barrier(group=group, device_ids=[rank])
     torch.cuda.synchronize()
-    time_begin_broadcast = time.time()
+    time_begin_broadcast = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need
@@ -182,7 +182,7 @@ def init_process_hf(
             group=group,
         )
     torch.cuda.synchronize()
-    time_end_broadcast = time.time()
+    time_end_broadcast = time.perf_counter()

     # Measure the latency of broadcasting/weights update.
     broadcast_time = time_end_broadcast - time_begin_broadcast
@@ -282,7 +282,7 @@ def init_process_sgl(
     )
     torch.cuda.synchronize()
-    time_begin_update = time.time()
+    time_begin_update = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need
@@ -312,7 +312,7 @@ def init_process_sgl(
         },
     )
     torch.cuda.synchronize()
-    time_end_update = time.time()
+    time_end_update = time.perf_counter()

     # Measure the latency of broadcast/weights update.
     update_time = time_end_update - time_begin_update
...
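Note that the hunks above keep torch.cuda.synchronize() on both sides of the timer: CUDA kernels launch asynchronously, so a host-side clock only brackets the GPU work if the device queue is drained first. A sketch of the same discipline, assuming a CUDA-enabled PyTorch build:

import time

import torch

x = torch.randn(4096, 4096, device="cuda")
torch.cuda.synchronize()  # drain pending kernels before starting the clock
t0 = time.perf_counter()
y = x @ x  # asynchronous kernel launch; returns before the GPU finishes
torch.cuda.synchronize()  # wait for the matmul to actually complete
elapsed = time.perf_counter() - t0
print(f"matmul took {elapsed:.4f}s")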
@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
     memory_before = torch.cuda.memory_allocated()
     new_tensor = torch.full((16384, 2048), 1.5, device="cuda")

-    time_start = time.time()
+    time_start = time.perf_counter()
     engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-    print(f"Time delta: {time.time() - time_start:.03f}")
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")

     for param_name in param_names[:3]:
         _check_param(engine, param_name, [1.5] * 5)
...