Unverified commit 25ebed2f, authored by Woosuk Kwon, committed by GitHub
Browse files

[V1][Minor] Cache np arange to reduce input preparation overhead (#11214)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent d263bd9d
...@@ -118,6 +118,12 @@ class GPUModelRunner: ...@@ -118,6 +118,12 @@ class GPUModelRunner:
dtype=self.dtype, dtype=self.dtype,
device=self.device) device=self.device)
# OPTIMIZATION: Cache the tensors rather than creating them every step.
self.arange_np = np.arange(max(self.max_num_reqs, self.max_model_len),
dtype=np.int32)
# NOTE(woosuk): These tensors are "stateless", i.e., they are literally
# a faster version of creating a new tensor every time. Thus, we should
# not make any assumptions about the values in these tensors.
self.input_ids_cpu = torch.zeros(self.max_num_tokens, self.input_ids_cpu = torch.zeros(self.max_num_tokens,
dtype=torch.int32, dtype=torch.int32,
device="cpu", device="cpu",
...@@ -269,11 +275,13 @@ class GPUModelRunner: ...@@ -269,11 +275,13 @@ class GPUModelRunner:
# Get request indices. # Get request indices.
# E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
req_indices = np.repeat(np.arange(num_reqs), num_scheduled_tokens) req_indices = np.repeat(self.arange_np[:num_reqs],
num_scheduled_tokens)
# Get batched arange. # Get batched arange.
# E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
arange = np.concatenate([np.arange(n) for n in num_scheduled_tokens]) arange = np.concatenate(
[self.arange_np[:n] for n in num_scheduled_tokens])
# Get positions. # Get positions.
positions_np = self.positions_np[:total_num_scheduled_tokens] positions_np = self.positions_np[:total_num_scheduled_tokens]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment