更新 vllm/_custom_ops.py, vllm/model_executor/layers/quantization/utils/w8a8_utils.py

2164aab4 · wanglong3 · ade7db0c · 2164aab4 · 2164aab4
Commit 2164aab4 authored Nov 26, 2025 by wanglong3
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

vllm/_custom_ops.py +3 -1

vllm/model_executor/layers/quantization/utils/w8a8_utils.py +1 -1

No files found.
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1150,6 +1150,8 @@ def blaslt_scaled_mm(a: torch.Tensor,
    n = b.shape[0]
    k = a.shape[1]
    _, out = quant_ops.hipblaslt_w8a8_gemm(a, b, scale_a, scale_b, m, n, k, 'NT', out_dtype)
+    if bias is not None:
+        out += bias
    return out

 def triton_scaled_mm(a: torch.Tensor,
@@ -2486,4 +2488,4 @@ direct_register_custom_op(
    op_func=awq_gemm,
    mutates_args=[],
    fake_impl=awq_gemm_fake,
-)
\ No newline at end of file
+)
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -504,7 +504,7 @@ def apply_int8_linear(
                                    scale_a=x_scale,
                                    scale_b=weight_scale,
                                    out_dtype=input.dtype,
-                                    bias=None)
+                                    bias=bias)
    else:
        return ops.rocblas_scaled_mm(
                x_q,