Update linear to use matmul

e35a9e99 · zhuwenwen · 7e5aa0c5 · e35a9e99 · e35a9e99
Commit e35a9e99 authored Oct 15, 2025 by zhuwenwen
Showing 2 changed files with 14 additions and 7 deletions

vllm/model_executor/layers/linear.py +10 -3

vllm/model_executor/layers/vocab_parallel_embedding.py +4 -4

No files found.
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -268,7 +268,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
        if self.use_llama_nn:
            # if os.environ['GEMM_PAD'] == '1' and gemm_bank_conf(layer.weight.shape[1] - 32):
            #     layer.weight = layer.weight[:,:-32]
            if bias is not None:
                if len(x.shape) == 2: 
                    return torch.addmm(bias, x, layer.weight)
@@ -277,8 +276,16 @@ class UnquantizedLinearMethod(LinearMethodBase):
            else:
                return torch.matmul(x, layer.weight)
        else:
-            if envs.VLLM_USE_NN and x.shape[-1] == layer.weight.shape[0]:
+            # if envs.VLLM_USE_NN and x.shape[-1] == layer.weight.shape[0]:
-                return dispatch_unquantized_gemm()(layer, x, layer.weight.t(), bias)
+            #     return dispatch_unquantized_gemm()(layer, x, layer.weight.t(), bias)
+            if envs.VLLM_USE_NN:
+                if bias is not None:
+                    if len(x.shape) == 2: 
+                        return torch.addmm(bias, x, layer.weight)
+                    else:
+                        return torch.matmul(x, layer.weight) + bias
+                else:
+                    return torch.matmul(x, layer.weight)
            else:
                return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)

--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -75,9 +75,9 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
            else:
                return torch.matmul(x, layer.weight)
        else:
-            if envs.VLLM_USE_NN and x.shape[-1] == layer.weight.shape[0]:
+            # if envs.VLLM_USE_NN and x.shape[-1] == layer.weight.shape[0]:
-                return dispatch_unquantized_gemm()(layer, x, layer.weight.t(), bias)
+            #     return dispatch_unquantized_gemm()(layer, x, layer.weight.t(), bias)
-            else:
+            # else:
            return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)