update paged_attn.py

85e8224c · zhuwenwen · b3ab1cdc · 85e8224c · 85e8224c
Commit 85e8224c authored Oct 16, 2024 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 2 additions and 1 deletion

vllm/attention/ops/paged_attn.py +1 -1

vllm/model_executor/models/baichuan.py +1 -0

No files found.
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -201,7 +201,7 @@ class PagedAttention:
                print(f"query.shape = {query.shape}, key_cache.shape = {key_cache.shape}, value_cache.shape = {value_cache.shape}")
                print(f"num_kv_heads = {num_kv_heads}, scale = {scale:.3f}, block_tables.shape = {block_tables.shape}, seq_lens.shape = {seq_lens.shape}, block_size = {block_size}, max_seq_len = {max_seq_len}")

-            if envs.VLLM_USE_OPT_OP:
+            if envs.VLLM_USE_OPT_OP and max_seq_len<8192:
                ops.paged_attention_v2_opt(
                    output,
                    exp_sums,

--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -358,6 +358,7 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
+        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'

    def forward(
        self,