Unverified commit d48f4d6d, authored by Andrew Sansom, committed by GitHub
Browse files

perf: Avoid copying inputs_embeds tensors to GPU unless prompt_embeds is enabled (#25739)


Signed-off-by: Andrew Sansom <andrew@protopia.ai>
parent e84e0735
......@@ -836,6 +836,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.input_batch.prev_sampled_token_ids is None:
# Normal scheduling case
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
return
......@@ -863,6 +864,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# If not all requests are decodes from the last iteration,
# We need to copy the input_ids_cpu to the GPU first.
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_commmon_tokens == 0:
......@@ -878,6 +880,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
0],
non_blocking=True)
if self.enable_prompt_embeds:
self.is_token_ids.gpu[:num_commmon_tokens] = True
return
# Upload the index tensors asynchronously
......@@ -978,6 +981,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
0,
token_indices_tensor,
out=self.input_ids.cpu[:total_num_scheduled_tokens])
if self.enable_prompt_embeds:
is_token_ids = self.input_batch.is_token_ids.flatten()
torch.index_select(
is_token_ids,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment