Commit c1ece8c6 authored by zhuwenwen's avatar zhuwenwen
Browse files

VLLM_USE_LIGHTOP and VLLM_USE_OPT_CAT

add shared_output and routed_scaling_factor of CompressedTensorsW8A8Int8MoEMethod
parent c7d0b817
......@@ -1097,7 +1097,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
shared_output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if enable_eplb:
raise NotImplementedError(
......@@ -1137,7 +1137,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
w2_scale=layer.w2_weight_scale,
a1_scale=layer.w13_input_scale,
a2_scale=layer.w2_input_scale,
use_nn_moe=False)
use_nn_moe=False,
shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor)
class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
......
......@@ -244,7 +244,7 @@ def get_model_architecture(
else:
os.environ['LM_NN'] = '1'
if (architectures == ['DeepseekV3ForCausalLM'] or architectures == ['DeepSeekMTPModel']):
if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
if not envs.is_set("VLLM_USE_LIGHTOP"):
os.environ['VLLM_USE_LIGHTOP'] = '1'
if not envs.is_set("VLLM_USE_OPT_CAT"):
......@@ -254,6 +254,12 @@ def get_model_architecture(
os.environ['GEMM_PAD'] = '0'
if os.getenv('FA_PAD') != '1':
os.environ['FA_PAD'] = '0'
else:
if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
if not envs.is_set("VLLM_USE_LIGHTOP"):
os.environ['VLLM_USE_LIGHTOP'] = '1'
if not envs.is_set("VLLM_USE_OPT_CAT"):
os.environ['VLLM_USE_OPT_CAT'] = '1'
# awq相关配置
try:
if os.getenv('AWQ_MOE_SZ') == None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment