Commit 8e95b5e2 authored by gaoqiong
Browse files

Merge branch 'v0.9.2-dev-wm' into 'v0.9.2-dev'

[fix] Fix the low MTP acceptance rate in v1 (修复v1 mtp接受率低的问题)

See merge request dcutoolkit/deeplearing/vllm!174
parents dea49b15 3dad13fb
......@@ -4106,7 +4106,7 @@ class CompilationConfig:
are always used, it can set this to False. Otherwise, it should
set this to True, and the compiler will copy the input to an
internally managed buffer. Default is False."""
full_cuda_graph: bool = True
full_cuda_graph: bool = False
"""whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs. Thus this
flag cannot be used together with splitting_ops. This may provide
......
......@@ -546,6 +546,9 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
device = self.runner.device
block_table = self.block_table
block_table_tensor = block_table.get_device_tensor()[:num_reqs]
slot_mapping = common_attn_metadata.slot_mapping
if slot_mapping is None:
block_table.slot_mapping[:num_actual_tokens].copy_(
block_table.slot_mapping_cpu[:num_actual_tokens],
non_blocking=True)
......
......@@ -45,6 +45,8 @@ class CommonAttentionMetadata:
"""(batch_size,), record the rejected tokens number in cpu and gpu"""
num_speculative_tokens: int = 0
"""Number of speculative tokens"""
slot_mapping: torch.Tensor = None
"""(batch_size, seq_len), slot mapping"""
M = TypeVar("M")
......
......@@ -159,6 +159,7 @@ class EagleProposer:
num_actual_tokens=num_tokens,
max_query_len=max_query_len,
num_rejected_tokens=num_rejected_tokens,
slot_mapping=target_slot_mapping
)
assert self.runner is not None
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment