Commit 8e22ded2 authored by zhuwenwen's avatar zhuwenwen
Browse files

Skip lmslim import when inferring non-quantized models

parent 492c5dea
......@@ -13,13 +13,14 @@ from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape)
from vllm.platforms import current_platform
from vllm.utils import W8a8GetCacheJSON
from lmslim.layers.gemm.int8_utils import per_token_quant_int8
from vllm.utils import direct_register_custom_op
from vllm.utils import direct_register_custom_op, W8a8GetCacheJSON
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm, has_flashinfer
# lmslim is treated as an optional dependency here: if importing
# per_token_quant_int8 fails for any reason, a notice is printed and the
# import of this module continues instead of raising.
# NOTE(review): on failure per_token_quant_int8 is left unbound, so any later
# use would raise NameError — confirm it is only reached when lmslim is installed.
try:
from lmslim.layers.gemm.int8_utils import per_token_quant_int8
except Exception:
print("INFO: Please install lmslim if you want to infer the quantitative model of moe.\n")
# Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
TORCH_DEVICE_IDENTITY = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment