[Bugfix] Fix QKVParallelLinearWithShardedLora bias bug (#10844)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[Bugfix] Fix QKVParallelLinearWithShardedLora bias bug (#10844)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
a4cf2561 · Jee Jee Li · GitHub · d746268e · a4cf2561 · a4cf2561
Unverified commit a4cf2561, authored Dec 03, 2024 by Jee Jee Li; committed by GitHub on Dec 03, 2024
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 1 addition and 9 deletions

.buildkite/test-pipeline.yaml +0 -1

vllm/lora/fully_sharded_layers.py +1 -8

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -481,7 +481,6 @@ steps:

 - label: LoRA TP Test (Distributed)
  num_gpus: 4
-  soft_fail: true
  source_file_dependencies:
  - vllm/lora
  - tests/lora

--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -77,13 +77,6 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
                                       add_input=True)
        # now have column partitioned output

-        if self.bias_stacked is not None:
-            self.bias_stacked = self.bias_stacked.view(
-                -1, self.bias_stacked.shape[-1])
-            self.bias_stacked = self.bias_stacked[
-                self.punica_wrapper.token_lora_indices]
-            output += self.bias_stacked
-
        output = output.view(*out_orig_shape)
        return output

@@ -222,7 +215,7 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
        self.punica_wrapper.add_expand(output,
                                       buffer,
                                       self.lora_b_stacked,
-                                       self.bias_all,
+                                       self.bias_stacked,
                                       add_input=True)
        # now have column partitioned output
        output = output.view(*out_orig_shape)