Commit 4080ac85 authored by zhuwenwen
Browse files

update flash_attn.py

parent 64fc5a29
......@@ -709,7 +709,7 @@ class FlashAttentionImpl(AttentionImpl):
out=output[:num_actual_tokens],
cu_seqlens_q=cu_seqlens_q,
max_seqlen_q=max_seqlen_q,
seqused_k=seqused_k,
seqused_k=seqused_k,
max_seqlen_k=max_seqlen_k,
softmax_scale=self.scale,
causal=True,
......@@ -717,7 +717,11 @@ class FlashAttentionImpl(AttentionImpl):
window_size=self.sliding_window,
block_table=block_table,
softcap=self.logits_soft_cap,
# scheduler_metadata=scheduler_metadata,
scheduler_metadata=scheduler_metadata,
# fa_version=self.vllm_flash_attn_version,
# q_descale=layer._q_scale.expand(descale_shape),
# k_descale=layer._k_scale.expand(descale_shape),
# v_descale=layer._v_scale.expand(descale_shape),
)
return output
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment