Update benchmark_paged_attention.py and the convert_vertical_slash_indexes declaration in ops.h

c004bf6e · zhuwenwen · 98f67566 · c004bf6e · c004bf6e
Commit c004bf6e authored Dec 13, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 97 deletions

benchmarks/kernels/benchmark_paged_attention.py benchmarks/kernels/benchmark_paged_attention.py +16 -96

csrc/ops.h csrc/ops.h +1 -1

No files found.
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -117,98 +117,24 @@ def main(
        for _ in range(num_iters):
            if version == "v1":
-                if args.gc_paged_attn:
+                ops.paged_attention_v1(
-                    if args.tc_paged_attn:
+                output,
-                        ops.paged_attention_v1_opt_tc(
+                query,
-                            output,
+                key_cache,
-                            query,
+                value_cache,
-                            key_cache,
+                num_kv_heads,
-                            value_cache,
+                scale,
-                            num_kv_heads,
+                block_tables,
-                            scale,
+                seq_lens,
-                            block_tables,
+                block_size,
-                            seq_lens,
+                max_seq_len,
-                            block_size,
+                alibi_slopes,
-                            max_seq_len,
+                kv_cache_dtype,
-                            alibi_slopes,
+                k_scale,
-                            kv_cache_dtype,
+                v_scale,
-                            k_scale,
+            )
-                            v_scale,
-                        )
-                    else:
-                        ops.paged_attention_v1_opt(
-                            output,
-                            query,
-                            key_cache,
-                            value_cache,
-                            num_kv_heads,
-                            scale,
-                            block_tables,
-                            seq_lens,
-                            block_size,
-                            max_seq_len,
-                            alibi_slopes,
-                            kv_cache_dtype,
-                            k_scale,
-                            v_scale,
-                        )
-                else:
-                    ops.paged_attention_v1(
-                    output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    num_kv_heads,
-                    scale,
-                    block_tables,
-                    seq_lens,
-                    block_size,
-                    max_seq_len,
-                    alibi_slopes,
-                    kv_cache_dtype,
-                    k_scale,
-                    v_scale,
-                )
            elif version == "v2":
                if not args.custom_paged_attn:   
-                    if args.gc_paged_attn:     
-                        if args.tc_paged_attn:
-                            ops.paged_attention_v1_opt_tc(
-                                output,
-                                query,
-                                key_cache,
-                                value_cache,
-                                num_kv_heads,
-                                scale,
-                                block_tables,
-                                seq_lens,
-                                block_size,
-                                max_seq_len,
-                                alibi_slopes,
-                                kv_cache_dtype,
-                                k_scale,
-                                v_scale,
-                            )
-                        else:
-                            ops.paged_attention_v2_opt(
-                                output,
-                                exp_sums,
-                                max_logits,
-                                tmp_output,
-                                query,
-                                key_cache,
-                                value_cache,
-                                num_kv_heads,
-                                scale,
-                                block_tables,
-                                seq_lens,
-                                block_size,
-                                max_seq_len,
-                                alibi_slopes,
-                                kv_cache_dtype,
-                                k_scale,
-                                v_scale,
-                            )
                    ops.paged_attention_v2(
                        output,
                        exp_sums,
@@ -322,12 +248,6 @@ if __name__ == "__main__":
        help="Data type for kv cache storage. If 'auto', will use model "
        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
        "ROCm (hcu) supports fp8 (=fp8_e4m3)")
-    parser.add_argument(
-        "--gc-paged-attn", action="store_true", help="Use gc paged attention"
-        )
-    parser.add_argument(
-        "--tc-paged-attn", action="store_true", help="Use tc paged attention"
-        )
    parser.add_argument(
        "--custom-paged-attn", action="store_true", help="Use custom paged attention"
    )

--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -59,7 +59,7 @@ void merge_attn_states(torch::Tensor& output,
                       const torch::Tensor& prefix_lse,
                       const torch::Tensor& suffix_output,
                       const torch::Tensor& suffix_lse);
-#ifndef USE_ROCM
 void convert_vertical_slash_indexes(
    torch::Tensor& block_count,      // [BATCH, N_HEADS, NUM_ROWS]
    torch::Tensor& block_offset,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]