Unverified commit f23b4c04, authored by Varun Sundar Rabindranath, committed by GitHub
Browse files

[BugFix] Pad input buffers in _dummy_run (#26209)


Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
parent 93540958
...@@ -3434,26 +3434,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -3434,26 +3434,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
with self.maybe_dummy_run_with_lora( with self.maybe_dummy_run_with_lora(
self.lora_config, num_scheduled_tokens, remove_lora self.lora_config, num_scheduled_tokens, remove_lora
): ):
model_kwargs = self._init_model_kwargs(num_tokens) # Make sure padding doesn't exceed max_num_tokens
assert num_tokens_after_padding <= self.max_num_tokens
model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
input_ids = None input_ids = None
inputs_embeds = self.inputs_embeds.gpu[:num_tokens] inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
model_kwargs = { model_kwargs = {
**model_kwargs, **model_kwargs,
**self._dummy_mm_kwargs(num_reqs), **self._dummy_mm_kwargs(num_reqs),
} }
elif self.enable_prompt_embeds: elif self.enable_prompt_embeds:
input_ids = None input_ids = None
inputs_embeds = self.inputs_embeds.gpu[:num_tokens] inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
model_kwargs = self._init_model_kwargs(num_tokens) model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
else: else:
input_ids = self.input_ids.gpu[:num_tokens] input_ids = self.input_ids.gpu[:num_tokens_after_padding]
inputs_embeds = None inputs_embeds = None
if self.uses_mrope: if self.uses_mrope:
positions = self.mrope_positions.gpu[:, :num_tokens] positions = self.mrope_positions.gpu[:, :num_tokens_after_padding]
else: else:
positions = self.positions.gpu[:num_tokens] positions = self.positions.gpu[:num_tokens_after_padding]
if get_pp_group().is_first_rank: if get_pp_group().is_first_rank:
intermediate_tensors = None intermediate_tensors = None
...@@ -3468,7 +3470,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -3468,7 +3470,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) )
intermediate_tensors = self.sync_and_slice_intermediate_tensors( intermediate_tensors = self.sync_and_slice_intermediate_tensors(
num_tokens, None, False num_tokens_after_padding, None, False
) )
# filter out the valid batch descriptor # filter out the valid batch descriptor
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment