[Bugfix] Fix incorrect vocab embedding shards for GGUF model in tensor parallelism (#7954)

d78789ac · Isotr0py · GitHub · c334b189 · d78789ac
Unverified commit d78789ac, authored Aug 30, 2024 by Isotr0py; committed by GitHub Aug 29, 2024
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 1 deletion

vllm/model_executor/layers/vocab_parallel_embedding.py vllm/model_executor/layers/vocab_parallel_embedding.py +4 -1

No files found.
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -351,7 +351,10 @@ class VocabParallelEmbedding(torch.nn.Module):
            param.weight_type = loaded_weight.item()
            return
        elif isinstance(param, UninitializedParameter):
-            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
        # If parameter does not have output dim, then it should
        # be copied onto all gpus (e.g. g_idx for act_order gptq).