Commit 2805c93e authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds

parents c8de4a43 1b98d0bb
...@@ -19,6 +19,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention ...@@ -19,6 +19,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
| Qwen3MoeForCausalLM | QWen3MoE | Yes | - | - | v0.8.4 | Yes | | Qwen3MoeForCausalLM | QWen3MoE | Yes | - | - | v0.8.4 | Yes |
| ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | No | Yes | v0.5.0 | Yes | | ChatGLMModel | glm-4v-9b,chatglm3,chatglm2 | Yes | No | Yes | v0.5.0 | Yes |
| Glm4ForCausalLM | GLM-4-0414 | No/Yes | - | - | v0.8.5.post1 | Yes | | Glm4ForCausalLM | GLM-4-0414 | No/Yes | - | - | v0.8.5.post1 | Yes |
| Glm4MoeForCausalLM | GLM-4.5,GLM-4.5-Air | No/Yes | - | - | v0.9.2 | Yes |
| DeepseekForCausalLM | Deepseek | Yes | No | - | v0.5.0 | Yes | | DeepseekForCausalLM | Deepseek | Yes | No | - | v0.5.0 | Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | No | - | v0.6.2 | Yes | | DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | No | - | v0.6.2 | Yes |
| DeepseekVLV2ForCausalLM | DeepSeek-VL2 | Yes | No | - | v0.7.2 | Yes | | DeepseekVLV2ForCausalLM | DeepSeek-VL2 | Yes | No | - | v0.7.2 | Yes |
......
...@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
if sha is None: if sha is None:
sha = get_sha(vllm_root) sha = get_sha(vllm_root)
if (major, minor) >= ('2', '5'): if (major, minor) >= ('2', '5'):
version = 'das.opt1.rc2.' + sha[:7] version = 'das.opt1.' + sha[:7]
else: else:
if (major, minor) >= ('2', '5'): if (major, minor) >= ('2', '5'):
version = 'das.opt1.rc2' version = 'das.opt1'
# dtk version # dtk version
......
...@@ -418,6 +418,9 @@ class ModelConfig: ...@@ -418,6 +418,9 @@ class ModelConfig:
- "transformers" will use the Transformers model implementation.""" - "transformers" will use the Transformers model implementation."""
override_attention_dtype: Optional[str] = None override_attention_dtype: Optional[str] = None
"""Override dtype for attention""" """Override dtype for attention"""
enable_chunked_prefill: Optional[bool] = None
"""If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens."""
def compute_hash(self) -> str: def compute_hash(self) -> str:
""" """
...@@ -448,6 +451,7 @@ class ModelConfig: ...@@ -448,6 +451,7 @@ class ModelConfig:
factors.append(self.rope_theta) factors.append(self.rope_theta)
# hf_config can control how the model looks! # hf_config can control how the model looks!
factors.append(self.hf_config.to_json_string()) factors.append(self.hf_config.to_json_string())
factors.append(self.enable_chunked_prefill)
str_factors = str(factors) str_factors = str(factors)
assert_hashable(str_factors) assert_hashable(str_factors)
return hashlib.sha256(str(factors).encode()).hexdigest() return hashlib.sha256(str(factors).encode()).hexdigest()
......
...@@ -1004,6 +1004,7 @@ class EngineArgs: ...@@ -1004,6 +1004,7 @@ class EngineArgs:
enable_sleep_mode=self.enable_sleep_mode, enable_sleep_mode=self.enable_sleep_mode,
model_impl=self.model_impl, model_impl=self.model_impl,
override_attention_dtype=self.override_attention_dtype, override_attention_dtype=self.override_attention_dtype,
enable_chunked_prefill=self.enable_chunked_prefill,
) )
def create_load_config(self) -> LoadConfig: def create_load_config(self) -> LoadConfig:
...@@ -1593,6 +1594,9 @@ class EngineArgs: ...@@ -1593,6 +1594,9 @@ class EngineArgs:
# For pooling tasks the default is False # For pooling tasks the default is False
if model_config.runner_type != "pooling": if model_config.runner_type != "pooling":
self.enable_chunked_prefill = True self.enable_chunked_prefill = True
if model_config.enable_chunked_prefill is not None and \
model_config.enable_chunked_prefill is False:
self.enable_chunked_prefill = False
if self.enable_prefix_caching is None: if self.enable_prefix_caching is None:
self.enable_prefix_caching = True self.enable_prefix_caching = True
else: else:
...@@ -1606,6 +1610,10 @@ class EngineArgs: ...@@ -1606,6 +1610,10 @@ class EngineArgs:
action = "Enabling" if \ action = "Enabling" if \
incremental_prefill_supported else "Disabling" incremental_prefill_supported else "Disabling"
if model_config.enable_chunked_prefill is not None and \
model_config.enable_chunked_prefill is False:
self.enable_chunked_prefill = False
if self.enable_chunked_prefill is None: if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = incremental_prefill_supported self.enable_chunked_prefill = incremental_prefill_supported
......
...@@ -1113,11 +1113,11 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1113,11 +1113,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
("true", "1")), ("true", "1")),
# vLLM will use lightop moe_sum # vLLM will use lightop moe_sum
"VLLM_USE_LIGHTOP_MOE_SUM": "VLLM_USE_LIGHTOP_MOE_SUM":
lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_SUM", "False").lower() in lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_SUM", "True").lower() in
("true", "1")), ("true", "1")),
# vLLM will use lightop moe_align_block_size # vLLM will use lightop moe_align_block_size
"VLLM_USE_LIGHTOP_MOE_ALIGN": "VLLM_USE_LIGHTOP_MOE_ALIGN":
lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_ALIGN", "False").lower() in lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_ALIGN", "True").lower() in
("true", "1")), ("true", "1")),
# vLLM will use opt merge_aatn_states, not triton # vLLM will use opt merge_aatn_states, not triton
"VLLM_USE_MERGE_ATTN_STATES_OPT": "VLLM_USE_MERGE_ATTN_STATES_OPT":
......
...@@ -251,6 +251,23 @@ class DeepseekV2MoE(nn.Module): ...@@ -251,6 +251,23 @@ class DeepseekV2MoE(nn.Module):
else: else:
final_hidden_states = self.experts(hidden_states=hidden_states, final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits) router_logits=router_logits)
if shared_output is not None:
if hidden_states.dtype != torch.float16:
final_hidden_states = final_hidden_states + shared_output
else:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states = final_hidden_states + shared_output \
* (1. / self.routed_scaling_factor)
if self.tp_size > 1:
if envs.VLLM_ENABLE_TBO:
final_hidden_states = self.tbo_all_reduce(final_hidden_states)
else:
final_hidden_states = (
self.experts.maybe_all_reduce_tensor_model_parallel(
final_hidden_states))
if not self.use_mori_ep: if not self.use_mori_ep:
if self.tp_size > 1: if self.tp_size > 1:
...@@ -721,7 +738,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -721,7 +738,7 @@ class DeepseekV2DecoderLayer(nn.Module):
residual = residual residual = residual
) )
residual = new_residual residual = new_residual
if hidden_states.dtype == torch.float16: if hidden_states.dtype == torch.float16:
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
hidden_states *= 1. / self.routed_scaling_factor hidden_states *= 1. / self.routed_scaling_factor
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment