Commit af0e6d8f authored by wangmin6
Browse files

Merge branch 'v0.15.1-dev-wm' into 'v0.15.1-dev'

[perf]消除sparse mla build时的拷贝调度空泡

See merge request dcutoolkit/deeplearing/vllm!510
parents efa6bed2 46ab154b
......@@ -456,6 +456,12 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad
dtype=torch.int32,
device=device,
)
self.req_id_per_token_buffer_cpu = torch.zeros((vllm_config.scheduler_config.max_num_batched_tokens,),
dtype=torch.int32,
device="cpu",
pin_memory=True)
self.req_id_per_token_buffer_np = self.req_id_per_token_buffer_cpu.numpy()
def _build_fp8_mixed_decode_prefill(
self,
......@@ -651,9 +657,10 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad
)
# Zero-fill for cudagraphs
self.req_id_per_token_buffer.fill_(0)
self.req_id_per_token_buffer_np[: req_id_per_token.shape[0]] = req_id_per_token
self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_(
torch.from_numpy(req_id_per_token), non_blocking=True
)
self.req_id_per_token_buffer_cpu[: req_id_per_token.shape[0]], non_blocking=True)
req_id_per_token = self.req_id_per_token_buffer[:num_tokens]
fp8_extra_metadata: (
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment