Unverified commit b46e4a06, authored by Jialin Ouyang, committed by GitHub
Browse files

[Core][Bookkeeping Optimization] Update against numpy view of is_token_ids tensor (#27618)


Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
parent d34f5fe9
...@@ -108,9 +108,10 @@ class InputBatch: ...@@ -108,9 +108,10 @@ class InputBatch:
pin_memory=False, pin_memory=False,
) )
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
self.is_token_ids = torch.zeros( self.is_token_ids_tensor = torch.zeros(
(max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False (max_num_reqs, max_model_len), device="cpu", dtype=bool, pin_memory=False
) )
self.is_token_ids = self.is_token_ids_tensor.numpy()
# Store prompt embeddings per request to avoid OOM from large upfront # Store prompt embeddings per request to avoid OOM from large upfront
# allocation if max_model_len is big. # allocation if max_model_len is big.
# Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size) # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
......
...@@ -1103,7 +1103,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1103,7 +1103,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
out=self.input_ids.cpu[:total_num_scheduled_tokens], out=self.input_ids.cpu[:total_num_scheduled_tokens],
) )
if self.enable_prompt_embeds: if self.enable_prompt_embeds:
is_token_ids = self.input_batch.is_token_ids.flatten() is_token_ids = self.input_batch.is_token_ids_tensor.flatten()
torch.index_select( torch.index_select(
is_token_ids, is_token_ids,
0, 0,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment