Commit e445bd91 authored by wujl5, committed by zhuwenwen
Browse files

fix: 细化量化模型开启融合场景

parent c0e0e7cd
...@@ -33,18 +33,8 @@ from vllm.platforms import current_platform ...@@ -33,18 +33,8 @@ from vllm.platforms import current_platform
import os import os
from vllm.model_executor.utils import gemm_bank_conf from vllm.model_executor.utils import gemm_bank_conf
from lmslim.quantize.quant_ops import lm_faster_rmsquant
if envs.USE_FUSED_RMS_QUANT: from lmslim.quantize.quant_ops import lm_fuse_silu_mul_quant
try:
from lmslim.quantize.quant_ops import lm_faster_rmsquant
except Exception as e:
print(f"Error: Import fused rmsquant error: {e}")
if envs.USE_FUSED_SILU_MUL_QUANT:
try:
# from lightop import fuse_silu_mul_quant
from lmslim.quantize.quant_ops import lm_fuse_silu_mul_quant
except Exception as e:
print(f"Error: Import fused silu_mul_qunat error: {e}")
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -261,7 +261,7 @@ def get_model_architecture( ...@@ -261,7 +261,7 @@ def get_model_architecture(
# os.environ['VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT'] = '1' # os.environ['VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT'] = '1'
if not envs.is_set("VLLM_SCHED_ENABLE_MINIMAL_INJECTION"): if not envs.is_set("VLLM_SCHED_ENABLE_MINIMAL_INJECTION"):
os.environ['VLLM_SCHED_ENABLE_MINIMAL_INJECTION'] = '1' os.environ['VLLM_SCHED_ENABLE_MINIMAL_INJECTION'] = '1'
if model_config.quantization is not None: if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
if not envs.is_set("USE_FUSED_RMS_QUANT"): if not envs.is_set("USE_FUSED_RMS_QUANT"):
os.environ['USE_FUSED_RMS_QUANT'] = '1' os.environ['USE_FUSED_RMS_QUANT'] = '1'
if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"): if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
...@@ -306,7 +306,8 @@ def get_model_architecture( ...@@ -306,7 +306,8 @@ def get_model_architecture(
# os.environ['VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT'] = '1' # os.environ['VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT'] = '1'
if not envs.is_set("VLLM_SCHED_ENABLE_MINIMAL_INJECTION"): if not envs.is_set("VLLM_SCHED_ENABLE_MINIMAL_INJECTION"):
os.environ['VLLM_SCHED_ENABLE_MINIMAL_INJECTION'] = '1' os.environ['VLLM_SCHED_ENABLE_MINIMAL_INJECTION'] = '1'
if model_config.quantization is not None: if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
if not envs.is_set("USE_FUSED_RMS_QUANT"): if not envs.is_set("USE_FUSED_RMS_QUANT"):
os.environ['USE_FUSED_RMS_QUANT'] = '1' os.environ['USE_FUSED_RMS_QUANT'] = '1'
if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"): if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment