Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
4cd1f688
Commit
4cd1f688
authored
Jan 26, 2026
by
wooway777
Browse files
issue/979 - removed commented paged attn codes
parent
1c18c046
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
314 deletions
+0
-314
src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu
...niop/ops/paged_attention/nvidia/paged_attention_nvidia.cu
+0
-148
src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu
...ttention_prefill/nvidia/paged_attention_prefill_nvidia.cu
+0
-166
No files found.
src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu
View file @
4cd1f688
...
@@ -212,151 +212,3 @@ infiniStatus_t Descriptor::calculate(
...
@@ -212,151 +212,3 @@ infiniStatus_t Descriptor::calculate(
}
}
}
// namespace op::paged_attention::nvidia
}
// namespace op::paged_attention::nvidia
// #include <cub/block/block_reduce.cuh>
// #include "../../../devices/nvidia/nvidia_common.cuh"
// #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
// #include "../../../reduce/cuda/reduce.cuh"
// #include "../cuda/kernel.cuh"
// #include "paged_attention_nvidia.cuh"
// template <typename Tdata, typename Tcompute, size_t HEAD_SIZE, size_t NUM_THREADS>
// INFINIOP_CUDA_KERNEL pagedAttention(
// Tdata *out, const Tdata *q, const Tdata *k_cache, const Tdata *v_cache,
// const int64_t *block_tables, const int64_t *seq_lens, const float *alibi_slopes,
// const size_t num_kv_heads, const float scale, const size_t max_num_blocks_per_seq,
// const size_t block_size,
// const ptrdiff_t q_stride,
// const ptrdiff_t kv_block_stride,
// const ptrdiff_t kv_head_stride,
// const ptrdiff_t o_stride) {
// op::paged_attention::cuda::pagedAttentionKernel<Tdata, Tcompute, HEAD_SIZE, NUM_THREADS>(
// out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes, num_kv_heads, scale,
// max_num_blocks_per_seq, block_size, q_stride, kv_block_stride, kv_head_stride, o_stride);
// }
// namespace op::paged_attention::nvidia {
// struct Descriptor::Opaque {
// std::shared_ptr<device::nvidia::Handle::Internal> internal;
// };
// Descriptor::~Descriptor() {
// delete _opaque;
// }
// infiniStatus_t Descriptor::create(
// infiniopHandle_t handle,
// Descriptor **desc_ptr,
// infiniopTensorDescriptor_t out_desc,
// infiniopTensorDescriptor_t q_desc,
// infiniopTensorDescriptor_t k_cache_desc,
// infiniopTensorDescriptor_t v_cache_desc,
// infiniopTensorDescriptor_t block_tables_desc,
// infiniopTensorDescriptor_t seq_lens_desc,
// const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
// float scale) {
// auto info = PagedAttentionInfo::create(out_desc, q_desc, k_cache_desc, v_cache_desc, block_tables_desc, seq_lens_desc, alibi_slopes_desc, scale);
// CHECK_RESULT(info);
// *desc_ptr = new Descriptor(
// new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
// info.take(), 0, handle->device, handle->device_id);
// return INFINI_STATUS_SUCCESS;
// }
// template <size_t HEAD_SIZE, size_t NUM_THREADS>
// infiniStatus_t launchKernel(void *out, const void *q, const void *k_cache, const void *v_cache,
// infiniDtype_t dtype,
// const void *block_tables, const void *seq_lens, const void *alibi_slopes,
// size_t num_heads, size_t num_seqs,
// size_t num_kv_heads, float scale, size_t max_num_blocks_per_seq, size_t block_size,
// ptrdiff_t q_stride, ptrdiff_t kv_block_stride, ptrdiff_t kv_head_stride, ptrdiff_t o_stride,
// cudaStream_t stream) {
// dim3 grid(uint64_t(num_heads), uint64_t(num_seqs), 1);
// dim3 block(NUM_THREADS);
// size_t shared_mem_size = (HEAD_SIZE + max_num_blocks_per_seq * block_size + 2) * sizeof(float);
// if (dtype == INFINI_DTYPE_F16) {
// pagedAttention<half, float, HEAD_SIZE, NUM_THREADS>
// <<<grid, block, shared_mem_size, stream>>>(
// (half *)out,
// (const half *)q, (const half *)k_cache, (const half *)v_cache,
// (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
// scale, max_num_blocks_per_seq, block_size,
// q_stride, kv_block_stride, kv_head_stride, o_stride);
// } else if (dtype == INFINI_DTYPE_BF16) {
// pagedAttention<__nv_bfloat16, float, HEAD_SIZE, NUM_THREADS>
// <<<grid, block, shared_mem_size, stream>>>(
// (__nv_bfloat16 *)out, (const __nv_bfloat16 *)q, (const __nv_bfloat16 *)k_cache, (const __nv_bfloat16 *)v_cache,
// (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
// scale, max_num_blocks_per_seq, block_size,
// q_stride, kv_block_stride, kv_head_stride, o_stride);
// } else if (dtype == INFINI_DTYPE_F32) {
// pagedAttention<float, float, HEAD_SIZE, NUM_THREADS>
// <<<grid, block, shared_mem_size, stream>>>(
// (float *)out, (const float *)q, (const float *)k_cache, (const float *)v_cache,
// (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
// scale, max_num_blocks_per_seq, block_size,
// q_stride, kv_block_stride, kv_head_stride, o_stride);
// } else {
// return INFINI_STATUS_BAD_TENSOR_DTYPE;
// }
// return INFINI_STATUS_SUCCESS;
// }
// infiniStatus_t Descriptor::calculate(
// void *workspace, size_t workspace_size,
// void *out, const void *q, const void *k_cache, const void *v_cache,
// const void *block_tables, const void *seq_lens, const void *alibi_slopes,
// void *stream_) const {
// cudaStream_t stream = (cudaStream_t)stream_;
// #define LAUNCH_HEADSIZE_BLOCKSIZE(__H_SIZE, __B_SIZE) \
// launchKernel<__H_SIZE, __B_SIZE>( \
// out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, \
// _info.num_heads, _info.num_seqs, \
// _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, \
// _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, \
// stream);
// #define SWITCH_HEAD_SIZE(__B_SIZE) \
// switch (_info.head_size) { \
// case 16: \
// LAUNCH_HEADSIZE_BLOCKSIZE(16, __B_SIZE) \
// break; \
// case 32: \
// LAUNCH_HEADSIZE_BLOCKSIZE(32, __B_SIZE) \
// break; \
// case 64: \
// LAUNCH_HEADSIZE_BLOCKSIZE(64, __B_SIZE) \
// break; \
// case 128: \
// LAUNCH_HEADSIZE_BLOCKSIZE(128, __B_SIZE) \
// break; \
// case 256: \
// LAUNCH_HEADSIZE_BLOCKSIZE(256, __B_SIZE) \
// break; \
// default: \
// return INFINI_STATUS_BAD_TENSOR_SHAPE; \
// }
// if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
// SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_1024)
// } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
// SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_512)
// } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
// SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_4096)
// } else {
// return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
// }
// #undef LAUNCH_HEADSIZE_BLOCKSIZE
// #undef SWITCH_HEAD_SIZE
// return INFINI_STATUS_SUCCESS;
// }
// } // namespace op::paged_attention::nvidia
src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu
View file @
4cd1f688
...
@@ -1285,45 +1285,6 @@ infiniStatus_t Descriptor::create(
...
@@ -1285,45 +1285,6 @@ infiniStatus_t Descriptor::create(
const
size_t
n
=
info
->
total_q_tokens
*
info
->
num_heads
;
const
size_t
n
=
info
->
total_q_tokens
*
info
->
num_heads
;
const
size_t
splitkv_workspace_bytes
=
use_splitkv
?
(
static_cast
<
size_t
>
(
num_splits
)
*
n
*
(
info
->
head_size
+
2
)
*
sizeof
(
float
))
:
0
;
const
size_t
splitkv_workspace_bytes
=
use_splitkv
?
(
static_cast
<
size_t
>
(
num_splits
)
*
n
*
(
info
->
head_size
+
2
)
*
sizeof
(
float
))
:
0
;
// FA2-style kernel needs a workspace scratch for:
// - converting block_tables + total_kv_lens to int32
// - storing softmax LSE (only required to satisfy the upstream kernel contract)
// bool want_fa2 = false;
// if (const char *k_env = std::getenv("INFINIOP_FLASH_PREFILL_KERNEL")) {
// want_fa2 = (std::strcmp(k_env, "fa2") == 0);
// }
// bool fa2_materialize_kv = false;
// if (const char *env = std::getenv("INFINIOP_FA2_MATERIALIZE_PAGED_KV")) {
// fa2_materialize_kv = (std::strcmp(env, "1") == 0) || (std::strcmp(env, "true") == 0);
// }
// size_t fa2_workspace_bytes = 0;
// // FA2 prefill supports both fp16 and bf16 inputs (head_dim=128, block_size=256).
// // Workspace sizing is identical since both are 16-bit element types.
// if (want_fa2 && (info->dtype == INFINI_DTYPE_F16 || info->dtype == INFINI_DTYPE_BF16) &&
// info->head_size == 128 && info->page_block_size == 256) {
// const size_t bt_bytes = info->num_seqs * info->max_num_blocks_per_seq * sizeof(int);
// const size_t len_bytes = info->num_seqs * sizeof(int);
// const size_t cuq_bytes = (info->num_seqs + 1) * sizeof(int);
// const size_t cuk_bytes = (info->num_seqs + 1) * sizeof(int);
// const size_t lse_bytes = info->num_heads * info->total_q_tokens * sizeof(float);
// // Add a small alignment slack since we sub-allocate with alignment.
// fa2_workspace_bytes = bt_bytes + len_bytes + cuq_bytes + cuk_bytes + lse_bytes + 64;
// // Optional: materialize paged KV into the FA2-friendly physical layout
// // [num_blocks, page_block_size, kv_heads, head_dim] (token-major) to avoid
// // extremely strided loads when the framework stores KV as
// // [num_blocks, kv_heads, page_block_size, head_dim] (head-major).
// if (fa2_materialize_kv) {
// // Materialize per-seq contiguous KV in *sequence order*:
// // [num_seqs, max_num_blocks_per_seq * page_block_size, kv_heads, head_dim].
// const size_t kv_elems =
// info->num_seqs * info->max_num_blocks_per_seq * info->page_block_size * info->num_kv_heads * info->head_size;
// const size_t kv_bytes = kv_elems * sizeof(uint16_t); // 16-bit (fp16/bf16)
// // K + V + alignment slack
// fa2_workspace_bytes += 2 * kv_bytes + 64;
// }
// }
const
size_t
workspace_bytes
=
splitkv_workspace_bytes
;
const
size_t
workspace_bytes
=
splitkv_workspace_bytes
;
// const size_t workspace_bytes = splitkv_workspace_bytes + fa2_workspace_bytes;
// const size_t workspace_bytes = splitkv_workspace_bytes + fa2_workspace_bytes;
...
@@ -1587,130 +1548,3 @@ infiniStatus_t Descriptor::calculate(
...
@@ -1587,130 +1548,3 @@ infiniStatus_t Descriptor::calculate(
}
}
}
// namespace op::paged_attention_prefill::nvidia
}
// namespace op::paged_attention_prefill::nvidia
// #include <cuda_fp16.h>
// #include <float.h>
// #include <math.h>
// #include <stdint.h>
// #include "../../../devices/nvidia/nvidia_common.cuh"
// #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
// #include "../cuda/kernel.cuh"
// #include "paged_attention_prefill_nvidia.cuh"
// template <typename Tdata, typename Tcompute>
// infiniStatus_t launchPagedAttentionPrefill(
// Tdata *out, const Tdata *q, const Tdata *k_cache, const Tdata *v_cache,
// const int64_t *block_tables,
// const int64_t *seq_lens,
// const int64_t *cum_seq_lens_q,
// const float *alibi_slopes,
// const size_t num_heads,
// const size_t num_seqs,
// const size_t num_kv_heads,
// const float scale,
// const size_t max_num_blocks_per_seq,
// const size_t block_size,
// const size_t total_q_tokens,
// const size_t head_size,
// const ptrdiff_t kv_block_stride,
// const ptrdiff_t kv_head_stride,
// const ptrdiff_t q_stride,
// const ptrdiff_t q_head_stride,
// cudaStream_t stream) {
// if (total_q_tokens == 0 || num_heads == 0) {
// return INFINI_STATUS_BAD_TENSOR_SHAPE;
// }
// dim3 grid(total_q_tokens, num_heads);
// dim3 block(head_size);
// op::paged_attention_prefill::cuda::pagedAttentionPrefillKernel<Tdata, Tcompute>
// <<<grid, block, 0, stream>>>(
// out, q, k_cache, v_cache,
// block_tables, seq_lens, cum_seq_lens_q, alibi_slopes,
// num_heads, num_kv_heads, scale,
// max_num_blocks_per_seq, block_size,
// kv_block_stride, kv_head_stride,
// q_stride, q_head_stride,
// head_size,
// num_seqs);
// return INFINI_STATUS_SUCCESS;
// }
// namespace op::paged_attention_prefill::nvidia {
// struct Descriptor::Opaque {
// std::shared_ptr<device::nvidia::Handle::Internal> internal;
// };
// Descriptor::~Descriptor() {
// delete _opaque;
// }
// infiniStatus_t Descriptor::create(
// infiniopHandle_t handle,
// Descriptor **desc_ptr,
// infiniopTensorDescriptor_t out_desc,
// infiniopTensorDescriptor_t q_desc,
// infiniopTensorDescriptor_t k_cache_desc,
// infiniopTensorDescriptor_t v_cache_desc,
// infiniopTensorDescriptor_t block_tables_desc,
// infiniopTensorDescriptor_t seq_lens_desc,
// infiniopTensorDescriptor_t cum_seq_lens_q_desc,
// const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
// float scale) {
// auto info = PagedAttentionPrefillInfo::create(
// out_desc, q_desc, k_cache_desc, v_cache_desc,
// block_tables_desc, seq_lens_desc,
// cum_seq_lens_q_desc,
// alibi_slopes_desc, scale);
// CHECK_RESULT(info);
// *desc_ptr = new Descriptor(
// new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
// info.take(), 0, handle->device, handle->device_id);
// return INFINI_STATUS_SUCCESS;
// }
// infiniStatus_t Descriptor::calculate(
// void *workspace, size_t workspace_size,
// void *out, const void *q, const void *k_cache, const void *v_cache,
// const void *block_tables,
// const void *seq_lens,
// const void *cum_seq_lens_q,
// const void *alibi_slopes,
// void *stream_) const {
// cudaStream_t stream = (cudaStream_t)stream_;
// #define LAUNCH_KERNEL(Tdata, Tcompute) \
// launchPagedAttentionPrefill<Tdata, Tcompute>( \
// (Tdata *)out, (const Tdata *)q, (const Tdata *)k_cache, (const Tdata *)v_cache, \
// (const int64_t *)block_tables, (const int64_t *)seq_lens, (const int64_t *)cum_seq_lens_q, \
// (const float *)alibi_slopes, \
// _info.num_heads, _info.num_seqs, _info.num_kv_heads, \
// _info.scale, _info.max_num_blocks_per_seq, \
// _info.block_size, _info.total_q_tokens, \
// _info.head_size, \
// _info.kv_block_stride, _info.kv_head_stride, \
// _info.q_stride, _info.q_head_stride, \
// stream)
// if (_info.dtype == INFINI_DTYPE_F16) {
// return LAUNCH_KERNEL(half, float);
// } else if (_info.dtype == INFINI_DTYPE_BF16) {
// return LAUNCH_KERNEL(__nv_bfloat16, float);
// } else if (_info.dtype == INFINI_DTYPE_F32) {
// return LAUNCH_KERNEL(float, float);
// }
// return INFINI_STATUS_BAD_TENSOR_DTYPE;
// }
// } // namespace op::paged_attention_prefill::nvidia
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment