Unverified Commit cbced7f0 authored by drbh, committed by GitHub

feat: adjust attn weight loading logic (#1975)

This PR updates `load_attention` to prefer loading model-specific attention
weights based on the model type. Additionally, `TensorParallelColumnLinear.load_multi`
was previously called in two places; this change reduces them to a single path.
parent 612bc483
@@ -49,30 +49,24 @@ if SYSTEM == "rocm":
 def load_attention(config, prefix, weights):
     bias = config.attention_bias
-    if config.num_attention_heads != config.num_key_value_heads:
-        return TensorParallelColumnLinear.load_multi(
-            config,
-            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-            dim=0,
-            weights=weights,
-            bias=bias,
-        )
-    else:
-        if config.model_type == "baichuan":
-            return TensorParallelColumnLinear.load_qkv(
-                config,
-                prefix=f"{prefix}.W_pack",
-                weights=weights,
-                bias=bias,
-            )
-        elif config.model_type == "phi3":
-            return TensorParallelColumnLinear.load_qkv(
-                config,
-                prefix=f"{prefix}.qkv_proj",
-                weights=weights,
-                bias=bias,
-            )
-        else:
-            return TensorParallelColumnLinear.load_multi(
-                config,
-                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+
+    # if specific model type, load the correct attention
+    if config.model_type == "phi3":
+        return TensorParallelColumnLinear.load_qkv(
+            config,
+            prefix=f"{prefix}.qkv_proj",
+            weights=weights,
+            bias=bias,
+        )
+    elif config.model_type == "baichuan":
+        return TensorParallelColumnLinear.load_qkv(
+            config,
+            prefix=f"{prefix}.W_pack",
+            weights=weights,
+            bias=bias,
+        )
+
+    # otherwise, load the default attention based on the number of heads
+    return TensorParallelColumnLinear.load_multi(
+        config,
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
...
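
For readability, below is a sketch of the resulting `load_attention` reassembled from the hunk above, assuming the module's existing `TensorParallelColumnLinear` import. The tail of the final `load_multi` call is cut off in the diff, so its trailing arguments (`dim=0`, `weights=weights`, `bias=bias`) are assumed here from the removed branch rather than taken from the new code.

def load_attention(config, prefix, weights):
    bias = config.attention_bias

    # fused-QKV checkpoints: dispatch on model type first
    if config.model_type == "phi3":
        # phi3 stores query/key/value fused under a single qkv_proj tensor
        return TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv_proj",
            weights=weights,
            bias=bias,
        )
    elif config.model_type == "baichuan":
        # baichuan fuses query/key/value under W_pack
        return TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.W_pack",
            weights=weights,
            bias=bias,
        )

    # default: load separate q/k/v projections; this single call replaces the
    # two load_multi call sites in the previous version
    return TensorParallelColumnLinear.load_multi(
        config,
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,  # assumed from the removed branch; the diff hunk is truncated here
        weights=weights,
        bias=bias,
    )

Note that the old head-count check (`num_attention_heads != num_key_value_heads`) is no longer needed: both the multi-head and grouped-query cases fall through to the same `load_multi` call, which is why the two previous call sites collapse into one.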