Commit 2164aab4 authored by wanglong3
Browse files

更新 vllm/_custom_ops.py, vllm/model_executor/layers/quantization/utils/w8a8_utils.py

parent ade7db0c
......@@ -1150,6 +1150,8 @@ def blaslt_scaled_mm(a: torch.Tensor,
n = b.shape[0]
k = a.shape[1]
_, out = quant_ops.hipblaslt_w8a8_gemm(a, b, scale_a, scale_b, m, n, k, 'NT', out_dtype)
if bias is not None:
out += bias
return out
def triton_scaled_mm(a: torch.Tensor,
......
......@@ -504,7 +504,7 @@ def apply_int8_linear(
scale_a=x_scale,
scale_b=weight_scale,
out_dtype=input.dtype,
bias=None)
bias=bias)
else:
return ops.rocblas_scaled_mm(
x_q,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment