feat: adjust attn weight loading logic (#1975)
This PR updates `load_attention` to prefer loading model-type-specific attention weights based on the model type. It also consolidates two separate call sites of `TensorParallelColumnLinear.load_multi` into a single path.
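A minimal sketch of the shape of the refactor: branch on the model type only to pick the weight prefixes, then funnel every case through one `load_multi` call. The class body, the `load_multi` signature, and the `W_pack`/`q_proj` prefix names below are illustrative stand-ins, not the project's actual API.

```python
class TensorParallelColumnLinear:
    @classmethod
    def load_multi(cls, config, prefixes, weights, bias, dim):
        # Stand-in: a real implementation would load and concatenate
        # the sharded projection weights along `dim`.
        return {"prefixes": prefixes, "bias": bias, "dim": dim}


def load_attention(config, prefix, weights):
    # Prefer model-type-specific prefixes where they differ
    # (hypothetical example: a fused-QKV checkpoint layout)...
    if getattr(config, "model_type", None) == "baichuan":
        prefixes = [f"{prefix}.W_pack"]
    else:
        prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]

    # ...then reduce the two previous call sites to this single path.
    return TensorParallelColumnLinear.load_multi(
        config, prefixes=prefixes, weights=weights, bias=False, dim=0
    )
```

The point of the single path is that any future change to how the column-parallel layer is constructed only has to be made once.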