issue/979 - removed commented paged attn codes

4cd1f688 · wooway777 · 1c18c046 · 4cd1f688 · 4cd1f688
Commit 4cd1f688 authored Jan 26, 2026 by wooway777
2 changed files
--- a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu
+++ b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu
@@ -212,151 +212,3 @@ infiniStatus_t Descriptor::calculate(
 }
 } // namespace op::paged_attention::nvidia
-// #include <cub/block/block_reduce.cuh>
-// #include "../../../devices/nvidia/nvidia_common.cuh"
-// #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-// #include "../../../reduce/cuda/reduce.cuh"
-// #include "../cuda/kernel.cuh"
-// #include "paged_attention_nvidia.cuh"
-// template <typename Tdata, typename Tcompute, size_t HEAD_SIZE, size_t NUM_THREADS>
-// INFINIOP_CUDA_KERNEL pagedAttention(
-//     Tdata *out, const Tdata *q, const Tdata *k_cache, const Tdata *v_cache,
-//     const int64_t *block_tables, const int64_t *seq_lens, const float *alibi_slopes,
-//     const size_t num_kv_heads, const float scale, const size_t max_num_blocks_per_seq,
-//     const size_t block_size,
-//     const ptrdiff_t q_stride,
-//     const ptrdiff_t kv_block_stride,
-//     const ptrdiff_t kv_head_stride,
-//     const ptrdiff_t o_stride) {
-//     op::paged_attention::cuda::pagedAttentionKernel<Tdata, Tcompute, HEAD_SIZE, NUM_THREADS>(
-//         out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes, num_kv_heads, scale,
-//         max_num_blocks_per_seq, block_size, q_stride, kv_block_stride, kv_head_stride, o_stride);
-// }
-// namespace op::paged_attention::nvidia {
-// struct Descriptor::Opaque {
-//     std::shared_ptr<device::nvidia::Handle::Internal> internal;
-// };
-// Descriptor::~Descriptor() {
-//     delete _opaque;
-// }
-// infiniStatus_t Descriptor::create(
-//     infiniopHandle_t handle,
-//     Descriptor **desc_ptr,
-//     infiniopTensorDescriptor_t out_desc,
-//     infiniopTensorDescriptor_t q_desc,
-//     infiniopTensorDescriptor_t k_cache_desc,
-//     infiniopTensorDescriptor_t v_cache_desc,
-//     infiniopTensorDescriptor_t block_tables_desc,
-//     infiniopTensorDescriptor_t seq_lens_desc,
-//     const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
-//     float scale) {
-//     auto info = PagedAttentionInfo::create(out_desc, q_desc, k_cache_desc, v_cache_desc, block_tables_desc, seq_lens_desc, alibi_slopes_desc, scale);
-//     CHECK_RESULT(info);
-//     *desc_ptr = new Descriptor(
-//         new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-//         info.take(), 0, handle->device, handle->device_id);
-//     return INFINI_STATUS_SUCCESS;
-// }
-// template <size_t HEAD_SIZE, size_t NUM_THREADS>
-// infiniStatus_t launchKernel(void *out, const void *q, const void *k_cache, const void *v_cache,
-//                             infiniDtype_t dtype,
-//                             const void *block_tables, const void *seq_lens, const void *alibi_slopes,
-//                             size_t num_heads, size_t num_seqs,
-//                             size_t num_kv_heads, float scale, size_t max_num_blocks_per_seq, size_t block_size,
-//                             ptrdiff_t q_stride, ptrdiff_t kv_block_stride, ptrdiff_t kv_head_stride, ptrdiff_t o_stride,
-//                             cudaStream_t stream) {
-//     dim3 grid(uint64_t(num_heads), uint64_t(num_seqs), 1);
-//     dim3 block(NUM_THREADS);
-//     size_t shared_mem_size = (HEAD_SIZE + max_num_blocks_per_seq * block_size + 2) * sizeof(float);
-//     if (dtype == INFINI_DTYPE_F16) {
-//         pagedAttention<half, float, HEAD_SIZE, NUM_THREADS>
-//             <<<grid, block, shared_mem_size, stream>>>(
-//                 (half *)out,
-//                 (const half *)q, (const half *)k_cache, (const half *)v_cache,
-//                 (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
-//                 scale, max_num_blocks_per_seq, block_size,
-//                 q_stride, kv_block_stride, kv_head_stride, o_stride);
-//     } else if (dtype == INFINI_DTYPE_BF16) {
-//         pagedAttention<__nv_bfloat16, float, HEAD_SIZE, NUM_THREADS>
-//             <<<grid, block, shared_mem_size, stream>>>(
-//                 (__nv_bfloat16 *)out, (const __nv_bfloat16 *)q, (const __nv_bfloat16 *)k_cache, (const __nv_bfloat16 *)v_cache,
-//                 (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
-//                 scale, max_num_blocks_per_seq, block_size,
-//                 q_stride, kv_block_stride, kv_head_stride, o_stride);
-//     } else if (dtype == INFINI_DTYPE_F32) {
-//         pagedAttention<float, float, HEAD_SIZE, NUM_THREADS>
-//             <<<grid, block, shared_mem_size, stream>>>(
-//                 (float *)out, (const float *)q, (const float *)k_cache, (const float *)v_cache,
-//                 (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
-//                 scale, max_num_blocks_per_seq, block_size,
-//                 q_stride, kv_block_stride, kv_head_stride, o_stride);
-//     } else {
-//         return INFINI_STATUS_BAD_TENSOR_DTYPE;
-//     }
-//     return INFINI_STATUS_SUCCESS;
-// }
-// infiniStatus_t Descriptor::calculate(
-//     void *workspace, size_t workspace_size,
-//     void *out, const void *q, const void *k_cache, const void *v_cache,
-//     const void *block_tables, const void *seq_lens, const void *alibi_slopes,
-//     void *stream_) const {
-//     cudaStream_t stream = (cudaStream_t)stream_;
-// #define LAUNCH_HEADSIZE_BLOCKSIZE(__H_SIZE, __B_SIZE)                                    \
-//     launchKernel<__H_SIZE, __B_SIZE>(                                                    \
-//         out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes,     \
-//         _info.num_heads, _info.num_seqs,                                                 \
-//         _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, \
-//         _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride,     \
-//         stream);
-// #define SWITCH_HEAD_SIZE(__B_SIZE)               \
-//     switch (_info.head_size) {                   \
-//     case 16:                                     \
-//         LAUNCH_HEADSIZE_BLOCKSIZE(16, __B_SIZE)  \
-//         break;                                   \
-//     case 32:                                     \
-//         LAUNCH_HEADSIZE_BLOCKSIZE(32, __B_SIZE)  \
-//         break;                                   \
-//     case 64:                                     \
-//         LAUNCH_HEADSIZE_BLOCKSIZE(64, __B_SIZE)  \
-//         break;                                   \
-//     case 128:                                    \
-//         LAUNCH_HEADSIZE_BLOCKSIZE(128, __B_SIZE) \
-//         break;                                   \
-//     case 256:                                    \
-//         LAUNCH_HEADSIZE_BLOCKSIZE(256, __B_SIZE) \
-//         break;                                   \
-//     default:                                     \
-//         return INFINI_STATUS_BAD_TENSOR_SHAPE;   \
-//     }
-//     if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
-//         SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_1024)
-//     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
-//         SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_512)
-//     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-//         SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_4096)
-//     } else {
-//         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
-//     }
-// #undef LAUNCH_HEADSIZE_BLOCKSIZE
-// #undef SWITCH_HEAD_SIZE
-//     return INFINI_STATUS_SUCCESS;
-// }
-// } // namespace op::paged_attention::nvidia
--- a/src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu
+++ b/src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu
@@ -1285,45 +1285,6 @@ infiniStatus_t Descriptor::create(
    const size_t n = info->total_q_tokens * info->num_heads;
    const size_t splitkv_workspace_bytes = use_splitkv ? (static_cast<size_t>(num_splits) * n * (info->head_size + 2) * sizeof(float)) : 0;
-    // FA2-style kernel needs a workspace scratch for:
-    // - converting block_tables + total_kv_lens to int32
-    // - storing softmax LSE (only required to satisfy the upstream kernel contract)
-    // bool want_fa2 = false;
-    // if (const char *k_env = std::getenv("INFINIOP_FLASH_PREFILL_KERNEL")) {
-    //     want_fa2 = (std::strcmp(k_env, "fa2") == 0);
-    // }
-    // bool fa2_materialize_kv = false;
-    // if (const char *env = std::getenv("INFINIOP_FA2_MATERIALIZE_PAGED_KV")) {
-    //     fa2_materialize_kv = (std::strcmp(env, "1") == 0) || (std::strcmp(env, "true") == 0);
-    // }
-    // size_t fa2_workspace_bytes = 0;
-    // // FA2 prefill supports both fp16 and bf16 inputs (head_dim=128, block_size=256).
-    // // Workspace sizing is identical since both are 16-bit element types.
-    // if (want_fa2 && (info->dtype == INFINI_DTYPE_F16 || info->dtype == INFINI_DTYPE_BF16) &&
-    //     info->head_size == 128 && info->page_block_size == 256) {
-    //     const size_t bt_bytes = info->num_seqs * info->max_num_blocks_per_seq * sizeof(int);
-    //     const size_t len_bytes = info->num_seqs * sizeof(int);
-    //     const size_t cuq_bytes = (info->num_seqs + 1) * sizeof(int);
-    //     const size_t cuk_bytes = (info->num_seqs + 1) * sizeof(int);
-    //     const size_t lse_bytes = info->num_heads * info->total_q_tokens * sizeof(float);
-    //     // Add a small alignment slack since we sub-allocate with alignment.
-    //     fa2_workspace_bytes = bt_bytes + len_bytes + cuq_bytes + cuk_bytes + lse_bytes + 64;
-    //     // Optional: materialize paged KV into the FA2-friendly physical layout
-    //     // [num_blocks, page_block_size, kv_heads, head_dim] (token-major) to avoid
-    //     // extremely strided loads when the framework stores KV as
-    //     // [num_blocks, kv_heads, page_block_size, head_dim] (head-major).
-    //     if (fa2_materialize_kv) {
-    //         // Materialize per-seq contiguous KV in *sequence order*:
-    //         // [num_seqs, max_num_blocks_per_seq * page_block_size, kv_heads, head_dim].
-    //         const size_t kv_elems =
-    //             info->num_seqs * info->max_num_blocks_per_seq * info->page_block_size * info->num_kv_heads * info->head_size;
-    //         const size_t kv_bytes = kv_elems * sizeof(uint16_t); // 16-bit (fp16/bf16)
-    //         // K + V + alignment slack
-    //         fa2_workspace_bytes += 2 * kv_bytes + 64;
-    //     }
-    // }
    const size_t workspace_bytes = splitkv_workspace_bytes;
    // const size_t workspace_bytes = splitkv_workspace_bytes + fa2_workspace_bytes;
@@ -1587,130 +1548,3 @@ infiniStatus_t Descriptor::calculate(
 }
 } // namespace op::paged_attention_prefill::nvidia
-// #include <cuda_fp16.h>
-// #include <float.h>
-// #include <math.h>
-// #include <stdint.h>
-// #include "../../../devices/nvidia/nvidia_common.cuh"
-// #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-// #include "../cuda/kernel.cuh"
-// #include "paged_attention_prefill_nvidia.cuh"
-// template <typename Tdata, typename Tcompute>
-// infiniStatus_t launchPagedAttentionPrefill(
-//     Tdata *out, const Tdata *q, const Tdata *k_cache, const Tdata *v_cache,
-//     const int64_t *block_tables,
-//     const int64_t *seq_lens,
-//     const int64_t *cum_seq_lens_q,
-//     const float *alibi_slopes,
-//     const size_t num_heads,
-//     const size_t num_seqs,
-//     const size_t num_kv_heads,
-//     const float scale,
-//     const size_t max_num_blocks_per_seq,
-//     const size_t block_size,
-//     const size_t total_q_tokens,
-//     const size_t head_size,
-//     const ptrdiff_t kv_block_stride,
-//     const ptrdiff_t kv_head_stride,
-//     const ptrdiff_t q_stride,
-//     const ptrdiff_t q_head_stride,
-//     cudaStream_t stream) {
-//     if (total_q_tokens == 0 || num_heads == 0) {
-//         return INFINI_STATUS_BAD_TENSOR_SHAPE;
-//     }
-//     dim3 grid(total_q_tokens, num_heads);
-//     dim3 block(head_size);
-//     op::paged_attention_prefill::cuda::pagedAttentionPrefillKernel<Tdata, Tcompute>
-//         <<<grid, block, 0, stream>>>(
-//             out, q, k_cache, v_cache,
-//             block_tables, seq_lens, cum_seq_lens_q, alibi_slopes,
-//             num_heads, num_kv_heads, scale,
-//             max_num_blocks_per_seq, block_size,
-//             kv_block_stride, kv_head_stride,
-//             q_stride, q_head_stride,
-//             head_size,
-//             num_seqs);
-//     return INFINI_STATUS_SUCCESS;
-// }
-// namespace op::paged_attention_prefill::nvidia {
-// struct Descriptor::Opaque {
-//     std::shared_ptr<device::nvidia::Handle::Internal> internal;
-// };
-// Descriptor::~Descriptor() {
-//     delete _opaque;
-// }
-// infiniStatus_t Descriptor::create(
-//     infiniopHandle_t handle,
-//     Descriptor **desc_ptr,
-//     infiniopTensorDescriptor_t out_desc,
-//     infiniopTensorDescriptor_t q_desc,
-//     infiniopTensorDescriptor_t k_cache_desc,
-//     infiniopTensorDescriptor_t v_cache_desc,
-//     infiniopTensorDescriptor_t block_tables_desc,
-//     infiniopTensorDescriptor_t seq_lens_desc,
-//     infiniopTensorDescriptor_t cum_seq_lens_q_desc,
-//     const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
-//     float scale) {
-//     auto info = PagedAttentionPrefillInfo::create(
-//         out_desc, q_desc, k_cache_desc, v_cache_desc,
-//         block_tables_desc, seq_lens_desc,
-//         cum_seq_lens_q_desc,
-//         alibi_slopes_desc, scale);
-//     CHECK_RESULT(info);
-//     *desc_ptr = new Descriptor(
-//         new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
-//         info.take(), 0, handle->device, handle->device_id);
-//     return INFINI_STATUS_SUCCESS;
-// }
-// infiniStatus_t Descriptor::calculate(
-//     void *workspace, size_t workspace_size,
-//     void *out, const void *q, const void *k_cache, const void *v_cache,
-//     const void *block_tables,
-//     const void *seq_lens,
-//     const void *cum_seq_lens_q,
-//     const void *alibi_slopes,
-//     void *stream_) const {
-//     cudaStream_t stream = (cudaStream_t)stream_;
-// #define LAUNCH_KERNEL(Tdata, Tcompute)                                                             \
-//     launchPagedAttentionPrefill<Tdata, Tcompute>(                                                  \
-//         (Tdata *)out, (const Tdata *)q, (const Tdata *)k_cache, (const Tdata *)v_cache,            \
-//         (const int64_t *)block_tables, (const int64_t *)seq_lens, (const int64_t *)cum_seq_lens_q, \
-//         (const float *)alibi_slopes,                                                               \
-//         _info.num_heads, _info.num_seqs, _info.num_kv_heads,                                       \
-//         _info.scale, _info.max_num_blocks_per_seq,                                                 \
-//         _info.block_size, _info.total_q_tokens,                                                    \
-//         _info.head_size,                                                                           \
-//         _info.kv_block_stride, _info.kv_head_stride,                                               \
-//         _info.q_stride, _info.q_head_stride,                                                       \
-//         stream)
-//     if (_info.dtype == INFINI_DTYPE_F16) {
-//         return LAUNCH_KERNEL(half, float);
-//     } else if (_info.dtype == INFINI_DTYPE_BF16) {
-//         return LAUNCH_KERNEL(__nv_bfloat16, float);
-//     } else if (_info.dtype == INFINI_DTYPE_F32) {
-//         return LAUNCH_KERNEL(float, float);
-//     }
-//     return INFINI_STATUS_BAD_TENSOR_DTYPE;
-// }
-// } // namespace op::paged_attention_prefill::nvidia