"vscode:/vscode.git/clone" did not exist on "37ee5700e97c4c820dc2590e2fe9d927f97a3d96"
Commit 8e22ded2 authored by zhuwenwen's avatar zhuwenwen
Browse files

Skip the lmslim import when running inference on non-quantized models

parent 492c5dea
...@@ -13,13 +13,14 @@ from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 ...@@ -13,13 +13,14 @@ from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape) GroupShape)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op, W8a8GetCacheJSON
from vllm.utils import W8a8GetCacheJSON
from lmslim.layers.gemm.int8_utils import per_token_quant_int8
from vllm.utils import direct_register_custom_op
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm, has_flashinfer from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm, has_flashinfer
try:
from lmslim.layers.gemm.int8_utils import per_token_quant_int8
except Exception:
print("INFO: Please install lmslim if you want to infer the quantitative model of moe.\n")
# Input scaling factors are no longer optional in _scaled_mm starting # Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
TORCH_DEVICE_IDENTITY = None TORCH_DEVICE_IDENTITY = None
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment