Commit 971a47d2 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev

parents 244b534f 74c6e218
...@@ -173,7 +173,7 @@ if TYPE_CHECKING: ...@@ -173,7 +173,7 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
USE_FUSED_RMS_QUANT: bool = False USE_FUSED_RMS_QUANT: bool = False
USE_FUSED_SILU_MUL_QUANT: bool = False USE_FUSED_SILU_MUL_QUANT: bool = True
VLLM_P2P_ASYNC: bool = False VLLM_P2P_ASYNC: bool = False
VLLM_P2P_BUF_TOKENS: int = 30000 VLLM_P2P_BUF_TOKENS: int = 30000
VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False VLLM_SCHED_ENABLE_MINIMAL_INJECTION: bool = False
...@@ -1141,10 +1141,12 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1141,10 +1141,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"USE_FUSED_RMS_QUANT": "USE_FUSED_RMS_QUANT":
lambda: (os.getenv('USE_FUSED_RMS_QUANT', '0').lower() in lambda: (os.getenv('USE_FUSED_RMS_QUANT', '0').lower() in
("true", "1")), ("true", "1")),
# vllm will use silu_mul_quant fused op # vllm will use silu_mul_quant fused op,
"USE_FUSED_SILU_MUL_QUANT": # This variable has a default value of true,
lambda: (os.getenv('USE_FUSED_SILU_MUL_QUANT', '0').lower() in # but it is still controlled by CRQ and RQ.
("true", "1")), "USE_FUSED_SILU_MUL_QUANT":
lambda: bool(int(os.getenv("USE_FUSED_SILU_MUL_QUANT", "1"))),
# vllm pd separation will be used async # vllm pd separation will be used async
"VLLM_P2P_ASYNC": "VLLM_P2P_ASYNC":
lambda: bool(int(os.getenv("VLLM_P2P_ASYNC", "0"))), lambda: bool(int(os.getenv("VLLM_P2P_ASYNC", "0"))),
......
...@@ -1536,9 +1536,18 @@ class RowParallelLinear(LinearBase): ...@@ -1536,9 +1536,18 @@ class RowParallelLinear(LinearBase):
# Only fuse bias add into GEMM for rank 0 (this ensures that # Only fuse bias add into GEMM for rank 0 (this ensures that
# bias will not get added more than once in TP>1 case) # bias will not get added more than once in TP>1 case)
bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
output_parallel = self.quant_method.apply(self, if use_fused_silu_mul_quant:
input_parallel, xq, xs = lm_fuse_silu_mul_quant(input_parallel)
bias=bias_)
silu_quant_args = [xq, xs]
output_parallel = self.quant_method.apply(self,
input_parallel,
bias=bias_,
silu_quant_args=silu_quant_args)
else:
output_parallel = self.quant_method.apply(self,
input_parallel,
bias=bias_)
if self.reduce_results and self.tp_size > 1: if self.reduce_results and self.tp_size > 1:
if envs.VLLM_ENABLE_TBO: if envs.VLLM_ENABLE_TBO:
output = self.tbo_all_reduce(output_parallel) output = self.tbo_all_reduce(output_parallel)
...@@ -1561,7 +1570,7 @@ class RowParallelLinear(LinearBase): ...@@ -1561,7 +1570,7 @@ class RowParallelLinear(LinearBase):
return output return output
return output, resi, xq, xs, output_bias return output, resi, xq, xs, output_bias
else: else: # RQ and Default forward
if self.input_is_parallel: if self.input_is_parallel:
input_parallel = input_ input_parallel = input_
else: else:
......
...@@ -167,6 +167,8 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase): ...@@ -167,6 +167,8 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
elif envs.USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT and input_quant_args is not None: elif envs.USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT and input_quant_args is not None:
assert len(input_quant_args) == 2 assert len(input_quant_args) == 2
x_q, x_scale = input_quant_args x_q, x_scale = input_quant_args
elif envs.USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT and silu_quant_args is not None:
x_q, x_scale = silu_quant_args
else: else:
x_q, x_scale = per_token_quant_int8(x) x_q, x_scale = per_token_quant_int8(x)
......
...@@ -109,8 +109,11 @@ class DeepseekV2MLP(nn.Module): ...@@ -109,8 +109,11 @@ class DeepseekV2MLP(nn.Module):
return x, new_resi return x, new_resi
elif envs.USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT and xqxs is not None: elif envs.USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT and xqxs is not None:
gate_up, _ = self.gate_up_proj(x, xqxs=xqxs) gate_up, _ = self.gate_up_proj(x, xqxs=xqxs)
x = self.act_fn(gate_up) if envs.USE_FUSED_SILU_MUL_QUANT:
x, _ = self.down_proj(x) x, _ = self.down_proj(gate_up, use_fused_silu_mul_quant=True)
else:
x = self.act_fn(gate_up)
x, _ = self.down_proj(x)
return x return x
else: else:
gate_up, _ = self.gate_up_proj(x) gate_up, _ = self.gate_up_proj(x)
...@@ -651,7 +654,10 @@ class DeepseekV2MLAAttention(nn.Module): ...@@ -651,7 +654,10 @@ class DeepseekV2MLAAttention(nn.Module):
q = self.q_proj(hidden_states)[0] q = self.q_proj(hidden_states)[0]
kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
[self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) if envs.VLLM_USE_LIGHTOP:
kv_c_normed = self.kv_a_layernorm.forward_cuda_opt(kv_c)
else:
kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
q = q.view(-1, self.num_local_heads, self.qk_head_dim) q = q.view(-1, self.num_local_heads, self.qk_head_dim)
k_pe = k_pe.unsqueeze(1) k_pe = k_pe.unsqueeze(1)
...@@ -926,8 +932,6 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -926,8 +932,6 @@ class DeepseekV2DecoderLayer(nn.Module):
forward_func = self.choose_forward() forward_func = self.choose_forward()
return forward_func(positions=positions, hidden_states=hidden_states, residual=residual ) return forward_func(positions=positions, hidden_states=hidden_states, residual=residual )
@support_torch_compile @support_torch_compile
class DeepseekV2Model(nn.Module): class DeepseekV2Model(nn.Module):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment