Unverified commit 179ae7da, authored by aabbccddwasd and committed by GitHub
Browse files

[Revert] Fix performance regression for GLM-4.7-GPTQ decode and MTP acceptance rate (#33771)


Signed-off-by: aabbccddwasd <aabbccddwasd@qq.com>
parent c4df59ad
@@ -919,9 +919,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 # Guard access to seq_lens_cpu, which may not always be needed
 # and can be expensive to retrieve in async mode.
 needs_seq_lens_cpu = self.use_dcp or use_cascade or not is_only_trtllm_decode
-seq_lens_cpu = (
-    common_attn_metadata.seq_lens.cpu() if needs_seq_lens_cpu else None
-)
+seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
 seq_lens_np = seq_lens_cpu.numpy() if seq_lens_cpu is not None else None
 num_blocks_np = (
     (seq_lens_np + (page_size - 1)) // page_size
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment