Fix weight loading for GQA with TP (#2379)

f780504d · Chenhui Zhang · GitHub · bfc072ad · f780504d
Unverified Commit f780504d authored Jan 16, 2024 by Chenhui Zhang Committed by GitHub Jan 15, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

vllm/model_executor/layers/linear.py vllm/model_executor/layers/linear.py +4 -1

No files found.
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -423,7 +423,10 @@ class QKVParallelLinear(ColumnParallelLinear):
                shard_offset = shard_offset // param.pack_factor
            param_data = param_data.narrow(output_dim, shard_offset,
                                           shard_size)
-            shard_id = tp_rank // self.num_kv_head_replicas
+            if loaded_shard_id == "q":
+                shard_id = tp_rank
+            else:
+                shard_id = tp_rank // self.num_kv_head_replicas
            start_idx = shard_id * shard_size
            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                 shard_size)