Auto-convert the lm_head weight layout for Llama

1428c17d · zhuwenwen · 85def94c · 1428c17d · 1428c17d
Commit 1428c17d authored Oct 30, 2024 by zhuwenwen
Showing with 7 additions and 5 deletions

vllm/model_executor/layers/vocab_parallel_embedding.py +1 -2

vllm/model_executor/models/llama.py +6 -3

No files found.
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -22,7 +22,6 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
    def __init__(self):
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
-        self.use_lm_nn = os.environ.get('LM_NN') == '1'
    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
@@ -42,7 +41,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        if self.use_llama_nn and self.use_lm_nn:
+        if self.use_llama_nn and os.environ['LM_NN'] == '1':
            if bias is not None:
                if len(x.shape) == 2: 
                    return torch.addmm(bias, x, layer.weight)

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -455,7 +455,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
            self.quant_config=quant_config
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
-        self.use_lm_nn = os.environ.get('LM_NN') == '1'
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
@@ -574,8 +573,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                "self_attn.qkv_proj.weight",
                "self_attn.o_proj.weight",
                "mlp.gate_up_proj.weight",
-                "mlp.down_proj.weight"
+                "mlp.down_proj.weight",
-                # "lm_head.weight"
+                "lm_head.weight"
            ]
            if self.use_lm_nn:
@@ -587,6 +586,10 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
            qkv_words = "|".join(lay_qkv_words)          
            for layername, weight in params_dict.items():
+                if "lm_head.weight" in layername:
+                    os.environ['LM_NN'] = '1'  
+                else:
+                    os.environ['LM_NN'] = '0' 
                matches = re.findall(combined_words, layername)
                if matches:         
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):