Unverified Commit 3295cd8a authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Allow skipping warmup in bench_offline_throughput.py (#2103)

parent 5942dfc0
......@@ -57,6 +57,7 @@ class BenchArgs:
disable_ignore_eos: bool = False
extra_request_body: Optional[str] = None
seed: int = 1
skip_warmup: bool = False
do_not_exit: bool = False
@staticmethod
......@@ -152,6 +153,11 @@ class BenchArgs:
"additional generate params like sampling params.",
)
parser.add_argument("--seed", type=int, default=1, help="The random seed.")
parser.add_argument(
"--skip-warmup",
action="store_true",
help="Skip the warmup batches.",
)
parser.add_argument(
"--do-not-exit",
action="store_true",
......@@ -261,6 +267,7 @@ def throughput_test(
)
# Warm up
if not bench_args.skip_warmup:
logging.info("\nWarmup...")
throughput_test_once(
backend_name=bench_args.backend,
......
......@@ -156,9 +156,6 @@ class TpModelWorkerClient:
return logits_output, next_token_ids
def forward_batch_generation(self, model_worker_batch: ModelWorkerBatch):
# A cuda stream sync here to avoid the cuda illegal memory access error.
torch.cuda.current_stream().synchronize()
# Create a new copy of sampling_info because it will be updated in-place by the scheduler for the next batch.
sampling_info = model_worker_batch.sampling_info
sampling_info.update_penalties()
......@@ -169,6 +166,9 @@ class TpModelWorkerClient:
linear_penalties=sampling_info.linear_penalties,
)
# A cuda stream sync here to avoid the cuda illegal memory access error.
torch.cuda.current_stream().synchronize()
# Push a new batch to the queue
self.input_queue.put((model_worker_batch, self.future_token_ids_ct))
......
Markdown is supported
Attach a file by drag &amp; drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment