Commit 8e95b5e2 authored by gaoqiong
Browse files

Merge branch 'v0.9.2-dev-wm' into 'v0.9.2-dev'

[fix]修复v1 mtp接受率低的问题

See merge request dcutoolkit/deeplearing/vllm!174
parents dea49b15 3dad13fb
...@@ -4106,7 +4106,7 @@ class CompilationConfig: ...@@ -4106,7 +4106,7 @@ class CompilationConfig:
are always used, it can set this to False. Otherwise, it should are always used, it can set this to False. Otherwise, it should
set this to True, and the compiler will copy the input to an set this to True, and the compiler will copy the input to an
internally managed buffer. Default is False.""" internally managed buffer. Default is False."""
full_cuda_graph: bool = True full_cuda_graph: bool = False
"""whether to use a full cuda graph for the entire forward pass rather than """whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs. Thus this splitting certain operations such as attention into subgraphs. Thus this
flag cannot be used together with splitting_ops. This may provide flag cannot be used together with splitting_ops. This may provide
......
...@@ -546,6 +546,9 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): ...@@ -546,6 +546,9 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
device = self.runner.device device = self.runner.device
block_table = self.block_table block_table = self.block_table
block_table_tensor = block_table.get_device_tensor()[:num_reqs] block_table_tensor = block_table.get_device_tensor()[:num_reqs]
slot_mapping = common_attn_metadata.slot_mapping
if slot_mapping is None:
block_table.slot_mapping[:num_actual_tokens].copy_( block_table.slot_mapping[:num_actual_tokens].copy_(
block_table.slot_mapping_cpu[:num_actual_tokens], block_table.slot_mapping_cpu[:num_actual_tokens],
non_blocking=True) non_blocking=True)
......
...@@ -45,6 +45,8 @@ class CommonAttentionMetadata: ...@@ -45,6 +45,8 @@ class CommonAttentionMetadata:
"""(batch_size,), record the rejected tokens number in cpu and gpu""" """(batch_size,), record the rejected tokens number in cpu and gpu"""
num_speculative_tokens: int = 0 num_speculative_tokens: int = 0
"""Number of speculative tokens""" """Number of speculative tokens"""
slot_mapping: torch.Tensor = None
"""(batch_size, seq_len), slot mapping"""
M = TypeVar("M") M = TypeVar("M")
......
...@@ -159,6 +159,7 @@ class EagleProposer: ...@@ -159,6 +159,7 @@ class EagleProposer:
num_actual_tokens=num_tokens, num_actual_tokens=num_tokens,
max_query_len=max_query_len, max_query_len=max_query_len,
num_rejected_tokens=num_rejected_tokens, num_rejected_tokens=num_rejected_tokens,
slot_mapping=target_slot_mapping
) )
assert self.runner is not None assert self.runner is not None
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment