remove pa (optimized paged-attention kernels)

b374a264 · zhuwenwen · c0707728 · b374a264 · b374a264 · b374a264
Commit b374a264 authored Sep 22, 2025 by zhuwenwen
15 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,11 +249,6 @@ set(VLLM_EXT_SRC
  "csrc/layernorm_kernels.cu"
  "csrc/opt/transpose_kernels.cu"
  "csrc/opt/activation_kernels_opt.cu"
-  "csrc/attention/attention_kernels_opt.cu"
-  "csrc/attention/attention_kernels_opt_tc.cu"
-  "csrc/attention/attention_with_mask_kernels.cu"
-  "csrc/attention/attention_with_mask_kernels_opt.cu"
-  "csrc/attention/attention_with_mask_kernels_opt_tc.cu"
  "csrc/opt/layernorm_kernels_opt.cu"
  # "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"

--- a/README.md
+++ b/README.md
 # <div align="center"><strong>vLLM</strong></div>
-## 简介
-vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。
-
-## 暂不支持的官方功能
- **量化推理**:目前不支持marlin的权重量化、kv-cache fp8推理方案
- **模块支持**:目前不支持Sliding window attention
-
-
-## 支持模型结构列表
-
-| 结构 | 模型 | FP16/BF16 | AWQ | GPTQ | 支持版本 | 是否优化 |
-| :------: | :------: | :------: | :------: |:------: | :------: |:------: |
-| LlamaForCausalLM               | Llama 3.2, Llama 3.1,Llama 3,Llama 2,Llama,Yi,Codellama,DeepSeek-R1-Distill-Llama     | Yes | Yes | Yes | v0.5.0，Llama 3.2>=v0.6.2 | Yes |  
-| Llama4ForConditionalGeneration | Llama 4                                                                               | No/Yes | -  | - | v0.8.5.post1  | No |
-| QWenLMHeadModel                | QWen,Qwen-VL                                                                          | Yes | Yes | Yes | v0.5.0，Qwen-VL>=v0.6.2 | Yes |
-| Qwen2ForCausalLM               | QWen2,QWen1.5,CodeQwen1.5,DeepSeek-R1-Distill-Qwen,gte_Qwen2-1.5B-instruct            | Yes | Yes | Yes | v0.5.0，gte>=v0.7.2   | Yes |
-| Qwen3ForCausalLM               | QWen3,Qwen3-Embedding,Qwen3-Reranker                                                  | Yes | - | - | v0.8.4   | Yes |
-| Qwen3MoeForCausalLM            | QWen3MoE                                                    | Yes | - | - | v0.8.4   | Yes |
-| ChatGLMModel                   | glm-4v-9b,chatglm3,chatglm2                                 | Yes | No  | Yes | v0.5.0   | Yes |
-| Glm4ForCausalLM                | GLM-4-0414                                                  | No/Yes | -  | - | v0.8.5.post1   | Yes |
-| DeepseekForCausalLM            | Deepseek                                                    | Yes | No  | -   | v0.5.0  | Yes |
-| DeepseekV2ForCausalLM          | DeepSeek-V2                                                 | Yes | No  | -   | v0.6.2  | Yes |
-| DeepseekVLV2ForCausalLM        | DeepSeek-VL2                                                | Yes | No  | -   | v0.7.2  | Yes |
-| DeepseekV3ForCausalLM          | DeepSeek-V3                                                 | Yes | Yes | -   | v0.7.2  | Yes |
-| BaiChuanForCausalLM            | Baichuan2,Baichuan                                          | Yes | Yes | -   | v0.5.0  | Yes |
-| BloomForCausalLM               | BLOOM                                                       | Yes | No  | Yes | v0.5.0  | Yes |
-| InternLMForCausalLM            | InternLM                                                    | Yes | No  | -   | v0.5.0  | Yes |
-| InternLM2ForCausalLM           | InternLM2                                                   | Yes | No  | -   | v0.5.0  | Yes |
-| FalconForCausalLM              | falcon                                                      | Yes | No  | Yes | v0.5.0  | Yes |
-| TeleChat2ForCausalLM           | TeleChat2                                                   | Yes | No  | -   | v0.7.2  | Yes |
-| MiniCPMForCausalLM             | MiniCPM                                                     | Yes | No  | -   | v0.5.0  | Yes |
-| MiniCPM3ForCausalLM            | MiniCPM3                                                    | Yes | No  | -   | v0.6.2  | Yes |
-| MixtralForCausalLM             | Mixtral-8x7B,Mixtral-8x7B-Instruct                          | Yes | No  | -   | v0.5.0  | Yes |
-| Qwen2MoeForCausalLM                 | Qwen2-57B-A14B,Qwen2-57B-A14B-Instruct        | Yes | No  | -   | v0.5.0   | No |
-| LlavaForConditionalGeneration       | LLaMA,LLaMA-2,LLaMA-3                         | Yes | No  | -   | v0.6.2   | No |
-| Qwen2VLForConditionalGeneration     | Qwen2-VL                                      | Yes | No  | Yes | v0.6.2   | No |
-| Qwen2_5_VLForConditionalGeneration  | Qwen.5-VL                                     | Yes | No  | Yes | v0.7.2   | No |
-| Mistral3ForConditionalGeneration    | Mistral3                                      | Yes | No  | -   | v0.8.5.post1   | No |
-| Gemma3ForConditionalGeneration      | Gemma 3                                       | Yes | -   | -   | v0.8.5.post1   | No |
-| MiniCPMV                            | MiniCPM-V                                     | Yes | No  | -   | v0.6.2  | No |
-| Phi3VForCausalLM                    | Phi-3.5-vision                                | Yes | No  | -   | v0.6.2  | No |
-| BertModel                           | bge-large-zh-v1.5                             | Yes | No  | -   | v0.7.2  | No |
-| XLMRobertaModel                     | bge-m3                                        | Yes | No  | -   | v0.7.2  | No |
-| XLMRobertaForSequenceClassification | bge-reranker-v2-m3                            | Yes | No  | -   | v0.7.2  | No |
-

 ## 安装
 vLLM支持
-+ Python 3.9.
 + Python 3.10.
-+ Python 3.11.
-+ Python 3.12.

 ### 使用源码编译方式安装

 #### 编译环境准备
-提供2种环境准备方式:
-
-1. 基于光源pytorch2.5.1基础镜像环境:根据pytorch2.5.1、python、dtk及系统下载对应的镜像版本。

-2. 基于现有python环境:安装pytorch2.5.1,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.5.1的whl包。安装命令如下:
+基于光源vllm0.9.2基础镜像环境:
 ```shell
-pip install torch* (下载的torch的whl包)
-pip install setuptools wheel
+docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2 
 ```

+镜像除编译环境外，已包含运行vllm需要的如下HCU依赖：
+* DTK驱动：dtk25.04.1
+* Pytorch: 2.5.1
+* triton: 3.0.0
+* lmslim: 0.3.1
+* flash_attn: 2.6.1
+* flash_mla: 1.0.0
+* lightop: 0.5.0
+
+
 #### 源码编译安装
+1. 下载源码并进入目录
 ```shell
-git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
+git clone -b v0.9.2 https://github.com/vllm-project/vllm.git
+cd vllm
 ```
-安装依赖:
+
+2. patch生成与执行(若单独打patch执行可忽略):
+- 生成
 ```shell
-pip install -r requirements/rocm.txt
+diff -Naur v0.9.2 patch-0.9.2+das.opt1.rc2.dtk2504 > patch_vllm.patch
 ```
- 提供2种源码编译方式(进入vllm目录):
+
+- 执行
+```shell
+patch -p1 < patch_vllm.patch
 ```
-1. 编译whl包并安装
-python setup.py bdist_wheel 
-cd dist
-pip install vllm*

-2. 源码编译安装
-python3 setup.py install （若调试，可使用python3 setup.py develop）
+3. 获取manylinux so并添加
+
+- 需要将已安装的vllm包目录(如/usr/local/lib/python3.10/dist-packages/vllm)下的_C.abi3.so和_moe_C.abi3.so拷贝至/opt/dtk/,并在源码的vllm/目录下为其创建软链接
+```shell
+cp /usr/local/lib/python3.10/dist-packages/vllm/*.so /opt/dtk/
+ln -s /opt/dtk/*.so vllm/
 ```
-若需要添加git号，设置环境变量: export ADD_GIT_VERSION=1

-#### 运行基础环境准备
-1、使用上面基于光源pytorch2.5.1基础镜像环境
+4. 安装依赖:
+```shell
+pip install -r requirements/rocm.txt
+```

-2、根据pytorch2.5.1、python、dtk及系统下载对应的依赖包:
- triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
- lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)
+5. 编译及安装
+- 编译whl包并安装
+```shell
+python setup.py bdist_wheel 
+cd dist
+pip install vllm*
+```
+- 源码编译安装
+```shell
+pip install . --no-build-isolation
+```

 #### 注意事项
 + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
@@ -103,5 +76,4 @@ python3 setup.py install （若调试，可使用python3 setup.py develop）
 - 无

 ## 参考资料
- [README_ORIGIN](README_ORIGIN.md)
 - [https://github.com/vllm-project/vllm](https://github.com/vllm-project/vllm)
\ No newline at end of file
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -119,98 +119,24 @@ def main(

        for _ in range(num_iters):
            if version == "v1":
-                if args.gc_paged_attn:
-                    if args.tc_paged_attn:
-                        ops.paged_attention_v1_opt_tc(
-                            output,
-                            query,
-                            key_cache,
-                            value_cache,
-                            num_kv_heads,
-                            scale,
-                            block_tables,
-                            seq_lens,
-                            block_size,
-                            max_seq_len,
-                            alibi_slopes,
-                            kv_cache_dtype,
-                            k_scale,
-                            v_scale,
-                        )
-                    else:
-                        ops.paged_attention_v1_opt(
-                            output,
-                            query,
-                            key_cache,
-                            value_cache,
-                            num_kv_heads,
-                            scale,
-                            block_tables,
-                            seq_lens,
-                            block_size,
-                            max_seq_len,
-                            alibi_slopes,
-                            kv_cache_dtype,
-                            k_scale,
-                            v_scale,
-                        )
-                else:
-                    ops.paged_attention_v1(
-                    output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    num_kv_heads,
-                    scale,
-                    block_tables,
-                    seq_lens,
-                    block_size,
-                    max_seq_len,
-                    alibi_slopes,
-                    kv_cache_dtype,
-                    k_scale,
-                    v_scale,
-                )
+                ops.paged_attention_v1(
+                output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                block_tables,
+                seq_lens,
+                block_size,
+                max_seq_len,
+                alibi_slopes,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
            elif version == "v2":
                if not args.custom_paged_attn:   
-                    if args.gc_paged_attn:     
-                        if args.tc_paged_attn:
-                            ops.paged_attention_v1_opt_tc(
-                                output,
-                                query,
-                                key_cache,
-                                value_cache,
-                                num_kv_heads,
-                                scale,
-                                block_tables,
-                                seq_lens,
-                                block_size,
-                                max_seq_len,
-                                alibi_slopes,
-                                kv_cache_dtype,
-                                k_scale,
-                                v_scale,
-                            )
-                        else:
-                            ops.paged_attention_v2_opt(
-                                output,
-                                exp_sums,
-                                max_logits,
-                                tmp_output,
-                                query,
-                                key_cache,
-                                value_cache,
-                                num_kv_heads,
-                                scale,
-                                block_tables,
-                                seq_lens,
-                                block_size,
-                                max_seq_len,
-                                alibi_slopes,
-                                kv_cache_dtype,
-                                k_scale,
-                                v_scale,
-                            )
                    ops.paged_attention_v2(
                        output,
                        exp_sums,
@@ -251,24 +177,6 @@ def main(
                        k_scale,
                        v_scale,
                    )
-            elif version == "v12":
-                from flash_attn import vllm_flash_attn_with_kvcache
-                vllm_flash_attn_with_kvcache(
-                    q=query.unsqueeze(1),  
-                    k_cache=key_cache,  
-                    v_cache=value_cache,  
-                    cache_seqlens=seq_lens,  
-                    block_table=block_tables, 
-                    softmax_scale=scale,
-                    causal=True,
-                    window_size=sliding_window,  
-                    softcap=logits_soft_cap,
-                    alibi_slopes=alibi_slopes,
-                    return_softmax_lse=False,
-                    k_scale=k_scale,  
-                    v_scale=v_scale, 
-                    kv_cache_dtype=kv_cache_dtype,  
-                ).squeeze(1) 
            else:
                raise ValueError(f"Invalid version: {version}")
        torch.cuda.synchronize()
@@ -298,7 +206,7 @@ if __name__ == "__main__":
    )

    parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.")
-    parser.add_argument("--version", type=str, choices=["v1", "v2", "v12"], default="v12")
+    parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2")
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument("--seq-len", type=int, default=4096)
    parser.add_argument("--num-query-heads", type=int, default=64)
@@ -324,12 +232,6 @@ if __name__ == "__main__":
        help="Data type for kv cache storage. If 'auto', will use model "
        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
        "ROCm (hcu) supports fp8 (=fp8_e4m3)")
-    parser.add_argument(
-        "--gc-paged-attn", action="store_true", help="Use gc paged attention"
-        )
-    parser.add_argument(
-        "--tc-paged-attn", action="store_true", help="Use tc paged attention"
-        )
    parser.add_argument(
        "--custom-paged-attn", action="store_true", help="Use custom paged attention"
    )

--- a/csrc/attention/attention_kernels_opt.cu
+++ b/csrc/attention/attention_kernels_opt.cu
--- a/csrc/attention/attention_kernels_opt_tc.cu
+++ b/csrc/attention/attention_kernels_opt_tc.cu
--- a/csrc/attention/attention_with_mask_kernels.cu
+++ b/csrc/attention/attention_with_mask_kernels.cu
--- a/csrc/attention/attention_with_mask_kernels_opt.cu
+++ b/csrc/attention/attention_with_mask_kernels_opt.cu
--- a/csrc/attention/attention_with_mask_kernels_opt_tc.cu
+++ b/csrc/attention/attention_with_mask_kernels_opt_tc.cu
--- a/csrc/attention/static_switch.h
+++ b/csrc/attention/static_switch.h
-#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
-  [&] {                                         \
-    if (COND) {                                 \
-      constexpr static bool CONST_NAME = true;  \
-      return __VA_ARGS__();                     \
-    } else {                                    \
-      constexpr static bool CONST_NAME = false; \
-      return __VA_ARGS__();                     \
-    }                                           \
-  }()
-
-#define OPT_SWITCH(COND, ...)      \
-  [&] {                                         \
-    if (COND) {                                 \
-      constexpr static int opt = 1;  \
-      return __VA_ARGS__();                     \
-    } else {                                    \
-      constexpr static int opt = 2; \
-      return __VA_ARGS__();                     \
-    }                                           \
-  }()
-
-#define NUM_THREADS_SWITCH(NUM_THREAD, ...)    \
-  [&] {                                         \
-    if (NUM_THREAD == 256) {                   \
-      constexpr static int NUM_THREADS = 256;  \
-      return __VA_ARGS__();                     \
-    } else {                                    \
-      constexpr static int NUM_THREADS = 128;  \
-      return __VA_ARGS__();                     \
-    }                                           \
-  }()
-
-  #define HEADSIZE_SWITCH(HEADDIM, ...)   \
-  [&] {                                    \
-    if (HEADDIM == 32) {                   \
-      constexpr static int HEAD_SIZE = 32;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 64) {            \
-      constexpr static int HEAD_SIZE = 64;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 80) {            \
-      constexpr static int HEAD_SIZE = 80;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 96) {            \
-      constexpr static int HEAD_SIZE = 96;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 112) {           \
-      constexpr static int HEAD_SIZE = 112; \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 128) {           \
-      constexpr static int HEAD_SIZE = 128; \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 192) {           \
-      constexpr static int HEAD_SIZE = 192; \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 256) {           \
-      constexpr static int HEAD_SIZE = 256; \
-      return __VA_ARGS__();                \
-    }                                      \
-    else {                                 \
-      TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
-    }                                      \
-  }()
-
-#define REUSEKV_SWITCH(num_blocks , ...)      \
-[&] {                                                   \
-    if (num_heads % 2 == 0 && num_heads / num_kv_heads >= 4 && num_blocks >= 1200){      \
-        constexpr static int REUSE_KV_TIMES = 4;        \
-        return __VA_ARGS__();                           \
-    } else if (num_heads / num_kv_heads >= 2 && num_blocks >= 1200){\
-        constexpr static int REUSE_KV_TIMES = 2;        \
-        return __VA_ARGS__();                           \
-    } else {                                            \
-        constexpr static int REUSE_KV_TIMES = 1;        \
-        return __VA_ARGS__();                           \
-    }                                                   \
-}()
-
-#define REUSEKV_SWITCH_V1(num_blocks , ...)      \
-[&] {                                                   \
-    if (num_heads > num_kv_heads && num_blocks >= 1200){      \
-        constexpr static int REUSE_KV_TIMES = 2;        \
-        return __VA_ARGS__();                           \
-    }  else {                                           \
-        constexpr static int REUSE_KV_TIMES = 1;        \
-        return __VA_ARGS__();                           \
-    }                                                   \
-}()
-
--- a/csrc/attention/static_switch_tc.h
+++ b/csrc/attention/static_switch_tc.h
-#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
-  [&] {                                         \
-    if (COND) {                                 \
-      constexpr static bool CONST_NAME = true;  \
-      return __VA_ARGS__();                     \
-    } else {                                    \
-      constexpr static bool CONST_NAME = false; \
-      return __VA_ARGS__();                     \
-    }                                           \
-  }()
-
-#define NUM_THREADS_SWITCH(NUM_THREAD, ...)    \
-  [&] {                                         \
-    if (NUM_THREAD == 256) {                   \
-      constexpr static int NUM_THREADS = 256;  \
-      return __VA_ARGS__();                     \
-    }else if (NUM_THREAD == 128) {                 \
-      constexpr static int NUM_THREADS = 128;  \
-      return __VA_ARGS__();                     \
-    } else {                                    \
-      constexpr static int NUM_THREADS = 64;  \
-      return __VA_ARGS__();                     \
-    }                                           \
-  }()
-
-  #define HEADSIZE_SWITCH(HEADDIM, ...)   \
-  [&] {                                    \
-    if (HEADDIM == 32) {                   \
-      constexpr static int HEAD_SIZE = 32;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 64) {            \
-      constexpr static int HEAD_SIZE = 64;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 80) {            \
-      constexpr static int HEAD_SIZE = 80;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 96) {            \
-      constexpr static int HEAD_SIZE = 96;  \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 112) {           \
-      constexpr static int HEAD_SIZE = 112; \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 128) {           \
-      constexpr static int HEAD_SIZE = 128; \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 192) {           \
-      constexpr static int HEAD_SIZE = 192; \
-      return __VA_ARGS__();                \
-    } else if (HEADDIM == 256) {           \
-      constexpr static int HEAD_SIZE = 256; \
-      return __VA_ARGS__();                \
-    }                                      \
-    else {                                 \
-      TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
-    }                                      \
-  }()
-
-#define REUSEKV_SWITCH(reusekv,...)      \
-[&] {                                                   \
-    if (reusekv==16){      \
-        constexpr static int REUSE_KV_TIMES = 16;        \
-        return __VA_ARGS__();}                           \
-    else if (reusekv==8){      \
-        constexpr static int REUSE_KV_TIMES = 8;        \
-        return __VA_ARGS__();                           \
-    }else if (reusekv==4){      \
-        constexpr static int REUSE_KV_TIMES = 4;        \
-        return __VA_ARGS__();                           \
-    }else if (reusekv==2){      \
-        constexpr static int REUSE_KV_TIMES = 2;        \
-        return __VA_ARGS__();                           \
-    }else {                                           \
-        constexpr static int REUSE_KV_TIMES = 1;        \
-        return __VA_ARGS__();                           \
-    }                                                   \
-}()
-
-#define USEVMAC_SWITCH_V1(num_blocks , ...)      \
-[&] {                                                   \
-    if (REUSE_KV_TIMES==1&&(num_blocks >2500 || padded_max_seq_len > 2048)){      \
-        constexpr static int use_vmac = false;        \
-        return __VA_ARGS__();                           \
-    }  else {                                           \
-        constexpr static int use_vmac = true;        \
-        return __VA_ARGS__();                           \
-    }                                                   \
-}()
\ No newline at end of file
--- a/csrc/ops.h
+++ b/csrc/ops.h
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -147,8 +147,6 @@ if TYPE_CHECKING:
    VLLM_USE_TRITON_OPT_MLA: bool = False
    VLLM_USE_FLASH_MLA: bool = False
    VLLM_USE_OPT_OP: bool = False
-    VLLM_USE_TC_PAGED_ATTN: bool = False
-    VLLM_USE_PA_PRINT_PARAM: bool = False 
    VLLM_SPEC_DECODE_EAGER: bool = False
    VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
    VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
@@ -1017,16 +1015,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in
             ("true", "1")),
    
-    # flag to control vllm to use optimized tc paged attn kernels
-    "VLLM_USE_TC_PAGED_ATTN":
-    lambda: (os.environ.get("VLLM_USE_TC_PAGED_ATTN", "True").lower() in
-             ("true", "1")),
-    
-    # flag to control if vllm print pa parameters
-    "VLLM_USE_PA_PRINT_PARAM":
-    lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
-             ("true", "1")),
-    
    # If set, vLLM will disable the draft model in cudagraph mode.
    "VLLM_SPEC_DECODE_EAGER":
    lambda: bool(int(os.getenv("VLLM_SPEC_DECODE_EAGER", "0"))),