Unverified commit 0feca02d, authored by Lianmin Zheng and committed by GitHub
Browse files

Improve benchmark scripts (#615)

parent 10143e1a
......@@ -100,11 +100,12 @@ def run_one_batch_size(bs):
with open("results.jsonl", "a") as fout:
res = {
"backend": args.backend,
"input_len": args.input_len,
"output_len": args.max_tokens,
"batch_size": bs,
"latency": latency,
"output_throughput": output_throughput
"output_throughput": output_throughput,
}
fout.write(json.dumps(res) + "\n")
......
......@@ -52,7 +52,7 @@ class TokenToKVPool:
# Prefetch buffer
self.prefetch_buffer = torch.empty(0, device="cuda", dtype=torch.int32)
self.prefetch_chunk_size = 256
self.prefetch_chunk_size = 512
self.clear()
......@@ -67,11 +67,11 @@ class TokenToKVPool:
if need_size <= buffer_len:
select_index = self.prefetch_buffer[:need_size]
self.prefetch_buffer = self.prefetch_buffer[need_size:]
return select_index.to(torch.int32)
return select_index
addition_size = need_size - buffer_len
alloc_size = max(addition_size, self.prefetch_chunk_size)
select_index = torch.nonzero(self.mem_state == 0).squeeze(1)[:alloc_size]
select_index = torch.nonzero(self.mem_state == 0).squeeze(1)[:alloc_size].to(torch.int32)
if select_index.shape[0] < addition_size:
return None
......@@ -82,7 +82,7 @@ class TokenToKVPool:
ret_index = self.prefetch_buffer[:need_size]
self.prefetch_buffer = self.prefetch_buffer[need_size:]
return ret_index.to(torch.int32)
return ret_index
def alloc_contiguous(self, need_size):
# NOTE: This function is deprecated.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment