[Bugfix] awq_gemm: fix argument order swap (#30364)

Signed-off-by: Matthias Gehre <matthias.gehre@amd.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>

[Bugfix] awq_gemm: fix argument order swap (#30364)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
e9add129 · Matthias Gehre · GitHub · 3224ea99 · e9add129 · e9add129
Unverified Commit e9add129 authored Dec 14, 2025 by Matthias Gehre Committed by GitHub Dec 14, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 7 deletions

tests/kernels/quantization/test_awq.py tests/kernels/quantization/test_awq.py +3 -3

vllm/_custom_ops.py vllm/_custom_ops.py +4 -4

No files found.
--- a/tests/kernels/quantization/test_awq.py
+++ b/tests/kernels/quantization/test_awq.py
@@ -41,9 +41,9 @@ def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
        qweight = torch.randint(
            -2000000000, 2000000000, (8192, 256), device="cuda", dtype=torch.int32
        )
-        scales = torch.randint(
+        scales = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
+        qzeros = torch.randint(
            -2000000000, 2000000000, (64, 256), device="cuda", dtype=torch.int32
        )
-        qzeros = torch.empty((64, 2048), device="cuda", dtype=torch.float16)
        split_k_iters = 8
-        opcheck(torch.ops._C.awq_gemm, (input, qweight, qzeros, scales, split_k_iters))
+        opcheck(torch.ops._C.awq_gemm, (input, qweight, scales, qzeros, split_k_iters))
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -498,15 +498,15 @@ def awq_dequantize(
 def awq_gemm(
    input: torch.Tensor,
    qweight: torch.Tensor,
-    qzeros: torch.Tensor,
    scales: torch.Tensor,
+    qzeros: torch.Tensor,
    split_k_iters: int,
 ) -> torch.Tensor:
    if envs.VLLM_USE_TRITON_AWQ:
        from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
-        return awq_gemm_triton(input, qweight, qzeros, scales, split_k_iters)
+        return awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters)
-    return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
+    return torch.ops._C.awq_gemm(input, qweight, scales, qzeros, split_k_iters)
 # gptq
@@ -632,8 +632,8 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
    def _awq_gemm_fake(
        input: torch.Tensor,
        qweight: torch.Tensor,
-        qzeros: torch.Tensor,
        scales: torch.Tensor,
+        qzeros: torch.Tensor,
        split_k_iters: torch.SymInt,
    ) -> torch.Tensor:
        num_in_feats = input.size(0)