Commit d04137d6 authored by 王敏
Browse files

[BUGFIX]修复w4a8低延迟报错

parent d5538a81
...@@ -599,6 +599,7 @@ def int8_w4a8_moe_quant_config( ...@@ -599,6 +599,7 @@ def int8_w4a8_moe_quant_config(
per_act_token_quant=per_act_token_quant, per_act_token_quant=per_act_token_quant,
per_out_ch_quant=False, per_out_ch_quant=False,
block_shape=block_shape, block_shape=block_shape,
weight_dtype="int4"
) )
......
...@@ -3,6 +3,7 @@ import os ...@@ -3,6 +3,7 @@ import os
import torch import torch
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from vllm.logger import init_logger
import vllm.envs as envs import vllm.envs as envs
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import get_current_vllm_config from vllm.config import get_current_vllm_config
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment