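update linear apply: pass `layer` as the first argument to the callable returned by dispatch_unquantized_gemm(), comment out the GEMM_PAD weight trim, and comment out the ROCm dispatch branch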

7e5aa0c5 · zhuwenwen · b35a518a · 7e5aa0c5 · 7e5aa0c5 · 7e5aa0c5
Commit 7e5aa0c5 authored Oct 14, 2025 by zhuwenwen
3 changed files
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -266,8 +266,8 @@ class UnquantizedLinearMethod(LinearMethodBase):
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        if self.use_llama_nn:
-            if gemm_bank_conf(layer.weight.shape[1] - 32) and os.environ['GEMM_PAD'] == '1':
+            # if os.environ['GEMM_PAD'] == '1' and gemm_bank_conf(layer.weight.shape[1] - 32):
-                layer.weight = layer.weight[:,:-32]
+            #     layer.weight = layer.weight[:,:-32]
            if bias is not None:
                if len(x.shape) == 2: 
@@ -278,9 +278,9 @@ class UnquantizedLinearMethod(LinearMethodBase):
                return torch.matmul(x, layer.weight)
        else:
            if envs.VLLM_USE_NN and x.shape[-1] == layer.weight.shape[0]:
-                return dispatch_unquantized_gemm()(x, layer.weight.t(), bias)
+                return dispatch_unquantized_gemm()(layer, x, layer.weight.t(), bias)
            else:
-                return dispatch_unquantized_gemm()(x, layer.weight, bias)
+                return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
 class LinearBase(CustomOp):

--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -187,10 +187,10 @@ def cpu_unquantized_gemm(layer: torch.nn.Module,
 def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
-    if current_platform.is_rocm():
+    # if current_platform.is_rocm():
        # return rocm_unquantized_gemm
-        return torch.nn.functional.linear
+        # return torch.nn.functional.linear
-    elif current_platform.is_cpu():
+    if current_platform.is_cpu():
        return cpu_unquantized_gemm
    else:
        return default_unquantized_gemm
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -76,9 +76,9 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
                return torch.matmul(x, layer.weight)
        else:
            if envs.VLLM_USE_NN and x.shape[-1] == layer.weight.shape[0]:
-                return dispatch_unquantized_gemm()(x, layer.weight.t(), bias)
+                return dispatch_unquantized_gemm()(layer, x, layer.weight.t(), bias)
            else:
-                return dispatch_unquantized_gemm()(x, layer.weight, bias)
+                return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
    def embedding(self, layer: torch.nn.Module,