Commit 3a306316 authored by liuchy5
Browse files

feat:修改fp8 mqa接口&&跳过VLLM_USE_FUSED_FILL_RMS_CAT&&跳过load_error

parent 8d371e97
...@@ -124,15 +124,15 @@ def sparse_attn_indexer( ...@@ -124,15 +124,15 @@ def sparse_attn_indexer(
) )
logits = op.mqa_logits( logits = op.mqa_logits(
q_fp8[chunk.token_start:chunk.token_end], q_fp8[chunk.token_start:chunk.token_end],
k, k_fp8,
weights[chunk.token_start:chunk.token_end].to(torch.float32), weights[chunk.token_start:chunk.token_end],
chunk.cu_seqlen_ks, chunk.cu_seqlen_ks,
chunk.cu_seqlen_ke, chunk.cu_seqlen_ke,
q_fp8[chunk.token_start:chunk.token_end].shape[0], q_fp8[chunk.token_start:chunk.token_end].shape[0],
k.shape[0], k_fp8.shape[0],
q_fp8.shape[1], q_fp8.shape[1],
q_fp8.shape[2], q_fp8.shape[2],
k_scale, k_scale.view(torch.float32).flatten(),
True True
) )
else: else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment