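Modify GEMM pad strategy: make GEMM_PAD opt-in (default off) and bypass the pad path in RowParallelLinear when it is disabled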

7fe40ced · zhuwenwen · e661266e · 7fe40ced · 7fe40ced
Commit 7fe40ced authored Jul 20, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 12 deletions

vllm/model_executor/layers/linear.py vllm/model_executor/layers/linear.py +12 -10

vllm/model_executor/model_loader/utils.py vllm/model_executor/model_loader/utils.py +2 -2

No files found.
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -867,17 +867,19 @@ class RowParallelLinear(LinearBase):

        assert param_data.shape == loaded_weight.shape
        
-        # if self.use_llama_nn:
-        #     loaded_weight = loaded_weight.transpose(0, 1)
-        #     loaded_weight=loaded_weight.reshape(param_data.shape[0],-1)
-        # param_data.copy_(loaded_weight)
-        
-        param_data.copy_(loaded_weight)
        if self.use_llama_nn:
-            if gemm_bank_conf(param.data.shape[0]) and self.use_gemm_pad:
-                param.data = pad_weight(param.data, 32)  
-            param.data = param.data.transpose(0, 1) 
-            param.data=param.data.reshape(param.data.shape[1],-1)
+            if not self.use_gemm_pad:
+                loaded_weight = loaded_weight.transpose(0, 1)
+                loaded_weight=loaded_weight.reshape(param_data.shape[0],-1)
+                param_data.copy_(loaded_weight)
+            else:
+                param_data.copy_(loaded_weight)
+                if gemm_bank_conf(param.data.shape[0]) and self.use_gemm_pad:
+                    param.data = pad_weight(param.data, 32)   
+                param.data = param.data.transpose(0, 1) 
+                param.data=param.data.reshape(param.data.shape[1],-1)
+        else:
+            param_data.copy_(loaded_weight)

    def forward(self, input_):
        # Set up backprop all-reduce.

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -25,8 +25,8 @@ def get_model_architecture(
    if architectures == ['LlamaForCausalLM'] or architectures == ['Qwen2ForCausalLM'] or architectures == ['ChatGLMModel'] or architectures == ['BaichuanForCausalLM']:
        if os.getenv('LLAMA_NN') != '0': 
            os.environ['LLAMA_NN'] = '1'
-        if os.getenv('GEMM_PAD') != '0': 
-            os.environ['GEMM_PAD'] = '1'
+        if os.getenv('GEMM_PAD') != '1': 
+            os.environ['GEMM_PAD'] = '0'
        if os.getenv('FA_PAD') != '1': 
            os.environ['FA_PAD'] = '0'
    else: