jerrrrry / infinicore · Commit 456ee3e1
authored Mar 09, 2026 by wooway777

issue/1065 - add infinicore packaging for mha kvcache

parent 665f383b

Showing 5 changed files with 227 additions and 23 deletions:
    python/infinicore/__init__.py                              +8    -6
    python/infinicore/ops/mha_kvcache.py (new)                 +67   -0
    src/infinicore/ops/mha_kvcache/mha_kvcache_flashattn.cc    +23   -17
    src/infinicore/pybind11/ops.hpp                            +2    -0
    src/infinicore/pybind11/ops/mha_kvcache.hpp (new)          +127  -0
python/infinicore/__init__.py

```diff
@@ -61,6 +61,7 @@ from infinicore.ops.cross_entropy import cross_entropy
 from infinicore.ops.equal import equal
 from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
+from infinicore.ops.mha_kvcache import mha_kvcache
 from infinicore.ops.mha_varlen import mha_varlen
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
@@ -131,16 +132,15 @@ __all__ = [
     "long",
     "short",
     "uint8",
-    # Operations.
-    "addcmul",
-    "atanh",
-    "binary_cross_entropy_with_logits",
-    "cdist",
-    "reciprocal",
+    # Operators.
     "add",
+    "addcmul",
     "add_rms_norm",
     "add_rms_norm_",
+    "atanh",
     "attention",
+    "binary_cross_entropy_with_logits",
+    "cdist",
     "kv_caching",
     "matmul",
     "equal",
@@ -156,11 +156,13 @@ __all__ = [
     "from_list",
     "from_numpy",
     "from_torch",
+    "mha_kvcache",
     "mha_varlen",
     "paged_caching",
     "paged_attention",
     "paged_attention_prefill",
     "ones",
+    "reciprocal",
     "strided_empty",
     "strided_from_blob",
     "zeros",
```
python/infinicore/ops/mha_kvcache.py (new file, mode 100644)

```python
from typing import Optional

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def mha_kvcache(
    q: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    seqlens_k: Tensor,
    block_table: Tensor,
    alibi_slopes: Optional[Tensor] = None,
    scale: float = 1.0,
    *,
    out: Optional[Tensor] = None,
) -> Tensor:
    """Flash attention KV-cache decode for single-step attention over a paged KV cache.

    This function performs attention decoding using a paged KV cache layout,
    which is efficient for inference with large sequence lengths.

    Args:
        q: Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
        k_cache: Key cache tensor of shape [num_blocks, block_size, num_heads_k, head_size] (paged layout)
        v_cache: Value cache tensor of shape [num_blocks, block_size, num_heads_k, head_size] (paged layout)
        seqlens_k: Total KV length per request, of shape [batch_size] (int32)
        block_table: Block mapping table of shape [batch_size, max_num_blocks_per_seq] (int32)
        alibi_slopes: Optional ALiBi slopes tensor; if None, ALiBi is disabled
        scale: Scaling factor for attention scores (typically 1.0 / sqrt(head_size))
        out: Optional output tensor. If provided, the operation is performed in place.

    Returns:
        Output tensor of shape [batch_size, seqlen_q, num_heads, head_size]

    Note:
        The KV cache uses a paged layout where:
        - k_cache and v_cache are organized into fixed-size blocks
        - block_table maps logical positions to physical blocks for each sequence
        - seqlens_k indicates the current total length of each sequence in the cache
    """
    if out is None:
        return Tensor(
            _infinicore.mha_kvcache(
                q._underlying,
                k_cache._underlying,
                v_cache._underlying,
                seqlens_k._underlying,
                block_table._underlying,
                alibi_slopes._underlying if alibi_slopes is not None else None,
                scale,
            )
        )

    _infinicore.mha_kvcache_(
        out._underlying,
        q._underlying,
        k_cache._underlying,
        v_cache._underlying,
        seqlens_k._underlying,
        block_table._underlying,
        alibi_slopes._underlying if alibi_slopes is not None else None,
        scale,
    )
    return out
```
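To make the new entry point concrete, here is a minimal decode-step sketch. Everything beyond the function signature is an assumption for illustration: `from_torch` is exported in `__init__.py`'s `__all__`, but its signature is not shown in this commit, and the shapes are arbitrary. Treat it as a sketch, not a verified snippet.

```python
# Hypothetical decode step over a paged KV cache (illustrative shapes).
import math

import torch
import infinicore

batch_size, seqlen_q, num_heads, head_size = 2, 1, 8, 64
num_blocks, block_size, max_blocks_per_seq = 32, 16, 8

def dev(t):
    # Assumed helper: wrap a CUDA torch tensor as an infinicore Tensor.
    return infinicore.from_torch(t.cuda())

q = dev(torch.randn(batch_size, seqlen_q, num_heads, head_size, dtype=torch.float16))
k_cache = dev(torch.randn(num_blocks, block_size, num_heads, head_size, dtype=torch.float16))
v_cache = dev(torch.randn(num_blocks, block_size, num_heads, head_size, dtype=torch.float16))

# Each row of block_table lists the physical blocks backing one sequence;
# seqlens_k holds the number of valid cached tokens per sequence.
block_table = dev(
    torch.arange(batch_size * max_blocks_per_seq, dtype=torch.int32)
    .reshape(batch_size, max_blocks_per_seq)
)
seqlens_k = dev(torch.tensor([40, 25], dtype=torch.int32))

out = infinicore.mha_kvcache(
    q, k_cache, v_cache, seqlens_k, block_table,
    alibi_slopes=None,
    scale=1.0 / math.sqrt(head_size),  # the typical 1/sqrt(head_size) from the docstring
)
# out: [batch_size, seqlen_q, num_heads, head_size]
```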
src/infinicore/ops/mha_kvcache/mha_kvcache_flashattn.cc

```diff
@@ -2,6 +2,8 @@
 #include "infinicore/adaptor/flash_attention_adaptor.hpp"
 
+#include <stdexcept>
+
 namespace infinicore::op::mha_kvcache_impl::flashattn {
 
 struct PlannedMeta {
@@ -30,26 +32,27 @@ void *plan(Tensor out,
 }
 
 void run(void *planned_meta) {
+#ifdef ENABLE_FLASH_ATTN
     c10::cuda::CUDAStreamGuard guard(infinicore::adaptor::get_cuda_stream());
     auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
 
     auto out = std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(p->out));
     auto q = infinicore::adaptor::to_aten_tensor(p->q);
     auto k_cache = infinicore::adaptor::to_aten_tensor(p->k_cache);
     auto v_cache = infinicore::adaptor::to_aten_tensor(p->v_cache);
     auto seqlens_k = std::optional<const at::Tensor>(infinicore::adaptor::to_aten_tensor(p->seqlens_k));
     auto block_table = std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(p->block_table));
     auto alibi_slopes = p->alibi_slopes
                             ? std::optional<at::Tensor>(infinicore::adaptor::to_aten_tensor(*p->alibi_slopes))
                             : std::nullopt;
 
     // No new KV tokens to append (pure decode, KV already written to cache).
     std::optional<const at::Tensor> k_new = std::nullopt;
     std::optional<const at::Tensor> v_new = std::nullopt;
     std::optional<const at::Tensor> rotary_cos = std::nullopt;
     std::optional<const at::Tensor> rotary_sin = std::nullopt;
     std::optional<const at::Tensor> cache_batch_idx = std::nullopt;
     std::optional<const at::Tensor> leftpad_k = std::nullopt;
 
     flash::mha_fwd_kvcache(
         q,
@@ -66,13 +69,16 @@ void run(void *planned_meta) {
         alibi_slopes,
         out,
         p->scale,
         true,  // is_causal
         -1,    // window_size_left (-1 = no sliding window)
         -1,    // window_size_right
         0.0f,  // softcap
         false, // is_rotary_interleaved
         0      // num_splits (0 = auto)
     );
+#else
+    throw std::runtime_error("FlashAttention is not enabled in this build");
+#endif
 }
 
 void cleanup(void **planned_meta_ptr) {
```
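Since the adaptor hands `block_table` and `seqlens_k` to `flash::mha_fwd_kvcache` unchanged, the paged layout described in the Python docstring fully determines where each cached token lives: token `t` of sequence `b` sits at `k_cache[block_table[b][t // block_size]][t % block_size]`. A reference gather makes this explicit; the `gather_keys` helper below is illustrative only and does not exist in the codebase.

```python
# Unoptimized reference: reassemble the contiguous keys of sequence b from the
# paged cache. The kernel reads the blocks in place and never builds this copy.
def gather_keys(k_cache, block_table, seqlens_k, b, block_size):
    """Return the [seqlens_k[b], num_heads_k, head_size] keys of sequence b."""
    rows = []
    for t in range(int(seqlens_k[b])):
        blk = block_table[b][t // block_size]      # physical block holding token t
        rows.append(k_cache[blk][t % block_size])  # slot of token t inside the block
    return rows
```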
src/infinicore/pybind11/ops.hpp

```diff
@@ -22,6 +22,7 @@
 #include "ops/linear.hpp"
 #include "ops/linear_w8a8i8.hpp"
 #include "ops/matmul.hpp"
+#include "ops/mha_kvcache.hpp"
 #include "ops/mha_varlen.hpp"
 #include "ops/mul.hpp"
 #include "ops/paged_attention.hpp"
@@ -54,6 +55,7 @@ inline void bind(py::module &m) {
     bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
+    bind_mha_kvcache(m);
     bind_mha_varlen(m);
     bind_hardswish(m);
     bind_hardtanh(m);
```
src/infinicore/pybind11/ops/mha_kvcache.hpp (new file, mode 100644)

```cpp
#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/mha_kvcache.hpp"

namespace py = pybind11;

namespace infinicore::ops {

Tensor py_mha_kvcache(Tensor q,
                      Tensor k_cache,
                      Tensor v_cache,
                      Tensor seqlens_k,
                      Tensor block_table,
                      pybind11::object alibi_slopes,
                      float scale) {
    std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
    if (!alibi_slopes.is_none()) {
        alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
    }
    return op::mha_kvcache(q, k_cache, v_cache, seqlens_k, block_table, alibi_slopes_tensor, scale);
}

void py_mha_kvcache_(Tensor out,
                     Tensor q,
                     Tensor k_cache,
                     Tensor v_cache,
                     Tensor seqlens_k,
                     Tensor block_table,
                     pybind11::object alibi_slopes,
                     float scale) {
    std::optional<Tensor> alibi_slopes_tensor = std::nullopt;
    if (!alibi_slopes.is_none()) {
        alibi_slopes_tensor = alibi_slopes.cast<Tensor>();
    }
    op::mha_kvcache_(out, q, k_cache, v_cache, seqlens_k, block_table, alibi_slopes_tensor, scale);
}

inline void bind_mha_kvcache(py::module &m) {
    m.def("mha_kvcache",
          &ops::py_mha_kvcache,
          py::arg("q"),
          py::arg("k_cache"),
          py::arg("v_cache"),
          py::arg("seqlens_k"),
          py::arg("block_table"),
          py::arg("alibi_slopes"),
          py::arg("scale"),
          R"doc(Flash attention KV-cache decode for single-step attention over a paged KV cache.

Parameters
----------
q : Tensor
    Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
k_cache : Tensor
    Key cache tensor of shape [num_blocks, block_size, num_heads_k, head_size] (paged layout)
v_cache : Tensor
    Value cache tensor of shape [num_blocks, block_size, num_heads_k, head_size] (paged layout)
seqlens_k : Tensor
    Total KV length per request, of shape [batch_size] (int32)
block_table : Tensor
    Block mapping table of shape [batch_size, max_num_blocks_per_seq] (int32)
alibi_slopes : Optional[Tensor]
    ALiBi slopes tensor; if None, ALiBi is disabled
scale : float
    Scaling factor for attention scores (typically 1.0 / sqrt(head_size))

Returns
-------
Tensor
    Output tensor of shape [batch_size, seqlen_q, num_heads, head_size]
)doc");

    m.def("mha_kvcache_",
          &ops::py_mha_kvcache_,
          py::arg("out"),
          py::arg("q"),
          py::arg("k_cache"),
          py::arg("v_cache"),
          py::arg("seqlens_k"),
          py::arg("block_table"),
          py::arg("alibi_slopes"),
          py::arg("scale"),
          R"doc(In-place flash attention KV-cache decode.

Parameters
----------
out : Tensor
    Output tensor of shape [batch_size, seqlen_q, num_heads, head_size]
q : Tensor
    Query tensor of shape [batch_size, seqlen_q, num_heads, head_size]
k_cache : Tensor
    Key cache tensor of shape [num_blocks, block_size, num_heads_k, head_size] (paged layout)
v_cache : Tensor
    Value cache tensor of shape [num_blocks, block_size, num_heads_k, head_size] (paged layout)
seqlens_k : Tensor
    Total KV length per request, of shape [batch_size] (int32)
block_table : Tensor
    Block mapping table of shape [batch_size, max_num_blocks_per_seq] (int32)
alibi_slopes : Optional[Tensor]
    ALiBi slopes tensor; if None, ALiBi is disabled
scale : float
    Scaling factor for attention scores (typically 1.0 / sqrt(head_size))
)doc");
}

} // namespace infinicore::ops
```
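The second binding, `mha_kvcache_`, backs the `out=` path of the Python wrapper above. Continuing the assumed setup from the earlier sketch (same hypothetical tensors and `dev()` helper), a decode loop can preallocate the output once and have every step write into it:

```python
# In-place variant: reuse one output buffer across decode steps.
# Same assumed tensors and dev() helper as the earlier sketch; illustrative only.
out = dev(torch.empty(batch_size, seqlen_q, num_heads, head_size, dtype=torch.float16))

infinicore.mha_kvcache(
    q, k_cache, v_cache, seqlens_k, block_table,
    scale=1.0 / math.sqrt(head_size),
    out=out,  # dispatches to _infinicore.mha_kvcache_ and writes into `out`
)
```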